From d886cf2feaea2819f6943b601ef63aff4c95ad7f Mon Sep 17 00:00:00 2001
From: Timothy Poon <62692924+ptim0626@users.noreply.github.com>
Date: Tue, 24 Jan 2023 09:57:33 +0000
Subject: [PATCH 01/37] Use primary context when setting up pycuda-related
 tests (#468)

* Use primary context when setting up pycuda-related tests

* Set context to None in test tear-down, as is done in pycuda

* Use primary context for multi-gpu mpi tests
---
 .../cuda_pycuda_tests/__init__.py             | 10 ++++++--
 .../cuda_pycuda_tests/multi_gpu_test.py       | 23 ++++++++++++-------
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/test/accelerate_tests/cuda_pycuda_tests/__init__.py b/test/accelerate_tests/cuda_pycuda_tests/__init__.py
index 04582430f..ab94a4eb6 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/__init__.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/__init__.py
@@ -26,7 +26,13 @@ class PyCudaTest(unittest.TestCase):
     def setUp(self):
         import sys
         np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf)
-        self.ctx = make_default_context()
+
+        def _retain_primary_context(dev):
+            ctx = dev.retain_primary_context()
+            ctx.push()
+            return ctx
+        self.ctx = make_default_context(_retain_primary_context)
+
         self.stream = cuda.Stream()
         # enable assertions in CUDA kernels for testing
         if not 'perf' in self._testMethodName:
@@ -37,7 +43,7 @@ def setUp(self):
     def tearDown(self):
         np.set_printoptions()
         self.ctx.pop()
-        self.ctx.detach()
+        self.ctx = None
         if not 'perf' in self._testMethodName:
            cuda_pycuda.debug_options = self.opts_old
 
diff --git a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py
index 64cc5110d..fdc34a528 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py
@@ -9,12 +9,13 @@
 if have_pycuda():
     from pycuda import gpuarray
     import pycuda.driver as cuda
+    from pycuda.tools import make_default_context
     from ptypy.accelerate.cuda_pycuda import multi_gpu as mgpu
     from ptypy.utils import parallel
 
 from pkg_resources import parse_version
 
-class GpuDataTest(PyCudaTest):
+class GpuDataTest(unittest.TestCase):
     """
     This is a test class for MPI - to really check if it all works, it needs
     to be run as:
@@ -27,20 +28,26 @@ class GpuDataTest(PyCudaTest):
 
     needs to be set, mpi4py version 3.1.0+ used, a pycuda build from master,
     and a cuda-aware MPI version.
+
+    To check if it is a cuda-aware MPI version:
+        ompi_info --parsable --all | grep mpi_built_with_cuda_support:value
     """
 
     def setUp(self):
         if parallel.rank_local < cuda.Device.count():
-            self.device = cuda.Device(parallel.rank_local)
-            self.ctx = self.device.make_context()
-            self.ctx.push()
+            def _retain_primary_context(dev):
+                ctx = dev.retain_primary_context()
+                ctx.push()
+                return ctx
+            self.ctx = make_default_context(_retain_primary_context)
+            self.device = self.ctx.get_device()
         else:
             self.ctx = None
 
     def tearDown(self):
         if self.ctx is not None:
             self.ctx.pop()
-            self.ctx.detach()
+            self.ctx = None
 
     @unittest.skipIf(parallel.rank != 0, "Only in MPI rank 0")
     def test_version(self):
@@ -53,7 +60,7 @@ def test_compute_mode(self):
         attr = cuda.Context.get_device().get_attributes()
         self.assertIn(cuda.device_attribute.COMPUTE_MODE, attr)
         mode = attr[cuda.device_attribute.COMPUTE_MODE]
-        self.assertIn(mode, 
+        self.assertIn(mode,
             [cuda.compute_mode.DEFAULT, cuda.compute_mode.PROHIBITED, cuda.compute_mode.EXCLUSIVE_PROCESS]
         )
 
@@ -71,7 +78,7 @@ def multigpu_tester(self, com):
 
     def test_multigpu_auto(self):
         self.multigpu_tester(mgpu.get_multi_gpu_communicator())
-        
+
     def test_multigpu_mpi(self):
         self.multigpu_tester(mgpu.MultiGpuCommunicatorMpi())
 
@@ -81,4 +88,4 @@ def test_multigpu_cudampi(self):
 
     @unittest.skipIf(not mgpu.have_nccl, "NCCL not available")
     def test_multigpu_nccl(self):
-        self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl())
\ No newline at end of file
+        self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl())

From 44e505515f7cb979a697df029881fa2644ec48eb Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Tue, 24 Jan 2023 13:24:37 +0000
Subject: [PATCH 02/37] CuPy backend (#469)

* Cupy backend + most array utils kernels passing / tested

* Import pybind11 later, so that setup can run without it and install it as a dependency

* Adds all elementary kernels + tests to cupy backend

* Adds complete cupy-based engines

* make sure headers (.cuh) are copied to build dir

* Finalising changes for the cupy engines

* Adds cupy template for minimal prep and run

* remove text from release notes

Co-authored-by: Team GPU <team-gpu@xcelerit.com>
---
 .gitignore                                    |    1 +
 cufft/extensions.py                           |    2 +-
 cufft/setup.py                                |    1 +
 ptypy/__init__.py                             |    7 +-
 .../cuda => cuda_common}/__init__.py          |    0
 .../cuda => cuda_common}/abs2sum.cu           |    3 +-
 .../cuda => cuda_common}/batched_multiply.cu  |    3 +-
 .../cuda => cuda_common}/build_aux.cu         |    0
 .../cuda => cuda_common}/build_aux_no_ex.cu   |    3 +-
 .../build_aux_position_correction.cu          |    3 +-
 .../cuda => cuda_common}/build_exit.cu        |    0
 .../build_exit_alpha_tau.cu                   |    0
 .../cuda => cuda_common}/clip_magnitudes.cu   |    5 +-
 ptypy/accelerate/cuda_common/common.cuh       |   12 +
 .../cuda => cuda_common}/convolution.cu       |    3 +-
 .../{cuda_pycuda/cuda => cuda_common}/delx.cu |    3 +-
 .../{cuda_pycuda/cuda => cuda_common}/dot.cu  |    4 +-
 .../cuda => cuda_common}/error_reduce.cu      |    0
 .../cuda => cuda_common}/exit_error.cu        |    7 +-
 .../cuda => cuda_common}/fill3D.cu            |    4 +-
 .../cuda => cuda_common}/fill_b.cu            |    0
 .../cuda => cuda_common}/fmag_all_update.cu   |    5 +-
 .../fmag_update_nopbound.cu                   |    5 +-
 .../cuda => cuda_common}/fourier_deviation.cu |    7 +-
 .../cuda => cuda_common}/fourier_error.cu     |    7 +-
 .../cuda => cuda_common}/fourier_error2.cu    |    5 +-
 .../cuda => cuda_common}/fourier_update.cu    |    5 +-
 .../cuda => cuda_common}/full_reduce.cu       |    2 -
 .../cuda => cuda_common}/gd_main.cu           |    3 +-
 .../cuda => cuda_common}/get_address.cu       |    4 +-
 .../cuda => cuda_common}/intens_renorm.cu     |    3 +-
 .../interpolated_shift.cu                     |    8 +-
 .../cuda => cuda_common}/log_likelihood.cu    |    7 +-
 .../cuda => cuda_common}/make_a012.cu         |    3 +-
 .../cuda => cuda_common}/make_aux.cu          |    3 +-
 .../cuda => cuda_common}/make_exit.cu         |    3 +-
 .../cuda => cuda_common}/make_model.cu        |    3 +-
 .../cuda => cuda_common}/mass_center.cu       |    0
 .../cuda => cuda_common}/max_abs2.cu          |    5 +-
 .../cuda => cuda_common}/ob_norm_local.cu     |    7 +-
 .../cuda => cuda_common}/ob_update.cu         |    3 +-
 .../cuda => cuda_common}/ob_update2.cu        |    4 +-
 .../cuda => cuda_common}/ob_update2_ML.cu     |    4 +-
 .../cuda => cuda_common}/ob_update_ML.cu      |    3 +-
 .../cuda => cuda_common}/ob_update_local.cu   |    3 +-
 .../cuda => cuda_common}/pr_norm_local.cu     |    7 +-
 .../cuda => cuda_common}/pr_update.cu         |    3 +-
 .../cuda => cuda_common}/pr_update2.cu        |    4 +-
 .../cuda => cuda_common}/pr_update2_ML.cu     |    4 +-
 .../cuda => cuda_common}/pr_update_ML.cu      |    3 +-
 .../cuda => cuda_common}/pr_update_local.cu   |    3 +-
 .../cuda => cuda_common}/transpose.cu         |    3 +-
 .../update_addr_error_state.cu                |    4 +-
 ptypy/accelerate/cuda_common/utils.py         |   18 +
 ptypy/accelerate/cuda_cupy/__init__.py        |   73 +
 .../accelerate/cuda_cupy/address_manglers.py  |   81 +
 ptypy/accelerate/cuda_cupy/array_utils.py     |  670 ++++++++
 ptypy/accelerate/cuda_cupy/cufft.py           |  171 +++
 ptypy/accelerate/cuda_cupy/dependencies.yml   |   17 +
 ptypy/accelerate/cuda_cupy/engines/ML_cupy.py |  804 ++++++++++
 .../accelerate/cuda_cupy/engines/__init__.py  |    0
 .../cuda_cupy/engines/projectional_cupy.py    |  636 ++++++++
 .../engines/projectional_cupy_stream.py       |  556 +++++++
 .../cuda_cupy/engines/stochastic.py           |  550 +++++++
 ptypy/accelerate/cuda_cupy/kernels.py         | 1345 +++++++++++++++++
 ptypy/accelerate/cuda_cupy/mem_utils.py       |  319 ++++
 ptypy/accelerate/cuda_cupy/multi_gpu.py       |  151 ++
 ptypy/accelerate/cuda_cupy/porting_notes.md   |   60 +
 ptypy/accelerate/cuda_pycuda/__init__.py      |    9 +-
 ptypy/accelerate/cuda_pycuda/array_utils.py   |   19 +-
 pyproject.toml                                |    2 +-
 .../ptypy_minimal_prep_and_run_cupy.py        |   54 +
 .../cuda_cupy_tests/__init__.py               |   33 +
 .../cuda_cupy_tests/address_manglers_test.py  |   77 +
 .../cuda_cupy_tests/array_utils_test.py       |  536 +++++++
 .../auxiliary_wave_kernel_test.py             |  666 ++++++++
 .../derivatives_kernel_test.py                |  330 ++++
 .../cuda_cupy_tests/engine_tests.py           |  172 +++
 .../cuda_cupy_tests/engine_utils_test.py      |   52 +
 .../cuda_cupy_tests/fft_scaling_test.py       |  204 +++
 .../cuda_cupy_tests/fft_setstream_test.py     |   97 ++
 .../fourier_update_kernel_test.py             |  685 +++++++++
 .../cuda_cupy_tests/gpudata_test.py           |  265 ++++
 .../gradient_descent_kernel_test.py           |  327 ++++
 .../cuda_cupy_tests/import_test.py            |   10 +
 .../cuda_cupy_tests/multi_gpu_test.py         |   74 +
 .../cuda_cupy_tests/po_update_kernel_test.py  |  943 ++++++++++++
 .../position_correction_kernel_test.py        |  149 ++
 .../propagation_kernel_test.py                |  157 ++
 .../cuda_pycuda_tests/fft_setstream_test.py   |    1 +
 90 files changed, 10351 insertions(+), 156 deletions(-)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/__init__.py (100%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/abs2sum.cu (94%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/batched_multiply.cu (95%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/build_aux.cu (100%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/build_aux_no_ex.cu (98%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/build_aux_position_correction.cu (96%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/build_exit.cu (100%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/build_exit_alpha_tau.cu (100%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/clip_magnitudes.cu (86%)
 create mode 100644 ptypy/accelerate/cuda_common/common.cuh
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/convolution.cu (99%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/delx.cu (99%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/dot.cu (93%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/error_reduce.cu (100%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/exit_error.cu (91%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/fill3D.cu (95%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/fill_b.cu (100%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/fmag_all_update.cu (95%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/fmag_update_nopbound.cu (95%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/fourier_deviation.cu (93%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/fourier_error.cu (93%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/fourier_error2.cu (96%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/fourier_update.cu (97%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/full_reduce.cu (97%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/gd_main.cu (95%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/get_address.cu (93%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/intens_renorm.cu (97%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/interpolated_shift.cu (98%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/log_likelihood.cu (97%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/make_a012.cu (97%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/make_aux.cu (98%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/make_exit.cu (97%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/make_model.cu (93%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/mass_center.cu (100%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/max_abs2.cu (96%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/ob_norm_local.cu (93%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/ob_update.cu (97%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/ob_update2.cu (98%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/ob_update2_ML.cu (98%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/ob_update_ML.cu (97%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/ob_update_local.cu (97%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/pr_norm_local.cu (93%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/pr_update.cu (97%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/pr_update2.cu (98%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/pr_update2_ML.cu (98%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/pr_update_ML.cu (97%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/pr_update_local.cu (97%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/transpose.cu (96%)
 rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/update_addr_error_state.cu (94%)
 create mode 100644 ptypy/accelerate/cuda_common/utils.py
 create mode 100644 ptypy/accelerate/cuda_cupy/__init__.py
 create mode 100644 ptypy/accelerate/cuda_cupy/address_manglers.py
 create mode 100644 ptypy/accelerate/cuda_cupy/array_utils.py
 create mode 100644 ptypy/accelerate/cuda_cupy/cufft.py
 create mode 100644 ptypy/accelerate/cuda_cupy/dependencies.yml
 create mode 100644 ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
 create mode 100644 ptypy/accelerate/cuda_cupy/engines/__init__.py
 create mode 100644 ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py
 create mode 100644 ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py
 create mode 100644 ptypy/accelerate/cuda_cupy/engines/stochastic.py
 create mode 100644 ptypy/accelerate/cuda_cupy/kernels.py
 create mode 100644 ptypy/accelerate/cuda_cupy/mem_utils.py
 create mode 100644 ptypy/accelerate/cuda_cupy/multi_gpu.py
 create mode 100644 ptypy/accelerate/cuda_cupy/porting_notes.md
 create mode 100644 templates/accelerate/ptypy_minimal_prep_and_run_cupy.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/__init__.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/address_manglers_test.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/array_utils_test.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/auxiliary_wave_kernel_test.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/derivatives_kernel_test.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/engine_tests.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/engine_utils_test.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/fft_scaling_test.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/fft_setstream_test.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/fourier_update_kernel_test.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/gpudata_test.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/gradient_descent_kernel_test.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/import_test.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/multi_gpu_test.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/position_correction_kernel_test.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/propagation_kernel_test.py

diff --git a/.gitignore b/.gitignore
index 70e473b91..3bebd583d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,4 @@ ghostdriver*
 .DS_Store
 .ipynb_checkpoints
 .clang-format
+pip-wheel-metadata/
diff --git a/cufft/extensions.py b/cufft/extensions.py
index 4fabf2d2c..545b43d04 100644
--- a/cufft/extensions.py
+++ b/cufft/extensions.py
@@ -4,7 +4,6 @@
 import os, re
 import subprocess
 import sysconfig
-import pybind11
 from distutils.unixccompiler import UnixCCompiler
 from distutils.command.build_ext import build_ext
 
@@ -98,6 +97,7 @@ def __init__(self, *args, **kwargs):
         self.LD_FLAGS = [archflag, "-lcufft_static", "-lculibos", "-ldl", "-lrt", "-lpthread", "-cudart shared"]
         self.NVCC_FLAGS = ["-dc", archflag]
         self.CXXFLAGS = ['"-fPIC"']
+        import pybind11
         pybind_includes = [pybind11.get_include(), sysconfig.get_path('include')]  
         INCLUDES = pybind_includes + [self.CUDA['lib64'], module_dir]
         self.INCLUDES = ["-I%s" % ix for ix in INCLUDES]
diff --git a/cufft/setup.py b/cufft/setup.py
index 8cba2f560..5108ebf32 100644
--- a/cufft/setup.py
+++ b/cufft/setup.py
@@ -39,6 +39,7 @@
     description='Extension of CuFFT to include pre- and post-filters using callbacks',
     packages=package_list,
     ext_modules=ext_modules,
+    install_requires=["pybind11"],
     cmdclass=cmdclass
 )
 
diff --git a/ptypy/__init__.py b/ptypy/__init__.py
index 0ca662fb7..5b34c35fa 100644
--- a/ptypy/__init__.py
+++ b/ptypy/__init__.py
@@ -78,11 +78,16 @@
 
 # Convenience loader for GPU engines
 def load_gpu_engines(arch='cuda'):
-    if arch=='cuda':
+    if arch in ['cuda', 'pycuda']:
         from .accelerate.cuda_pycuda.engines import projectional_pycuda
         from .accelerate.cuda_pycuda.engines import projectional_pycuda_stream
         from .accelerate.cuda_pycuda.engines import stochastic
         from .accelerate.cuda_pycuda.engines import ML_pycuda
+    if arch=='cupy':
+        from .accelerate.cuda_cupy.engines import projectional_cupy
+        from .accelerate.cuda_cupy.engines import projectional_cupy_stream
+        from .accelerate.cuda_cupy.engines import stochastic
+        from .accelerate.cuda_cupy.engines import ML_cupy
     if arch=='serial':
         from .accelerate.base.engines import projectional_serial
         from .accelerate.base.engines import projectional_serial_stream
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/__init__.py b/ptypy/accelerate/cuda_common/__init__.py
similarity index 100%
rename from ptypy/accelerate/cuda_pycuda/cuda/__init__.py
rename to ptypy/accelerate/cuda_common/__init__.py
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/abs2sum.cu b/ptypy/accelerate/cuda_common/abs2sum.cu
similarity index 94%
rename from ptypy/accelerate/cuda_pycuda/cuda/abs2sum.cu
rename to ptypy/accelerate/cuda_common/abs2sum.cu
index 475a228bb..9783c20cc 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/abs2sum.cu
+++ b/ptypy/accelerate/cuda_common/abs2sum.cu
@@ -5,8 +5,7 @@
  * - OUT_TYPE: can be float/double
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 extern "C" __global__ void abs2sum(const IN_TYPE* a,
                                    const int n,
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu b/ptypy/accelerate/cuda_common/batched_multiply.cu
similarity index 95%
rename from ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu
rename to ptypy/accelerate/cuda_common/batched_multiply.cu
index 1263841b6..f91bb6d38 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu
+++ b/ptypy/accelerate/cuda_common/batched_multiply.cu
@@ -8,8 +8,7 @@
  * - MATH_TYPE: the data type used for computation (filter)
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 extern "C" __global__ void batched_multiply(const complex<IN_TYPE>* input,
                                             complex<OUT_TYPE>* output,
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu b/ptypy/accelerate/cuda_common/build_aux.cu
similarity index 100%
rename from ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu
rename to ptypy/accelerate/cuda_common/build_aux.cu
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu b/ptypy/accelerate/cuda_common/build_aux_no_ex.cu
similarity index 98%
rename from ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu
rename to ptypy/accelerate/cuda_common/build_aux_no_ex.cu
index ee091c58e..d02500a1a 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu
+++ b/ptypy/accelerate/cuda_common/build_aux_no_ex.cu
@@ -6,8 +6,7 @@
  * - MATH_TYPE: the data type used for computation 
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 extern "C" __global__ void build_aux_no_ex(complex<OUT_TYPE>* auxilliary_wave,
                                            int aRows,
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu b/ptypy/accelerate/cuda_common/build_aux_position_correction.cu
similarity index 96%
rename from ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu
rename to ptypy/accelerate/cuda_common/build_aux_position_correction.cu
index 327040371..9d0f44fad 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu
+++ b/ptypy/accelerate/cuda_common/build_aux_position_correction.cu
@@ -6,8 +6,7 @@
  * - MATH_TYPE: the data type used for computation 
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 extern "C" __global__ void build_aux_position_correction(
     complex<OUT_TYPE>* auxiliary_wave,
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu b/ptypy/accelerate/cuda_common/build_exit.cu
similarity index 100%
rename from ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu
rename to ptypy/accelerate/cuda_common/build_exit.cu
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_exit_alpha_tau.cu b/ptypy/accelerate/cuda_common/build_exit_alpha_tau.cu
similarity index 100%
rename from ptypy/accelerate/cuda_pycuda/cuda/build_exit_alpha_tau.cu
rename to ptypy/accelerate/cuda_common/build_exit_alpha_tau.cu
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu b/ptypy/accelerate/cuda_common/clip_magnitudes.cu
similarity index 86%
rename from ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu
rename to ptypy/accelerate/cuda_common/clip_magnitudes.cu
index 8128091f9..5db29dbe9 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu
+++ b/ptypy/accelerate/cuda_common/clip_magnitudes.cu
@@ -1,10 +1,7 @@
 /** clip_magnitudes.
  *
  */
- #include <cassert>
- #include <cmath>
- #include <thrust/complex.h>
- using thrust::complex;
+ #include "common.cuh"
  
  extern "C" __global__ void clip_magnitudes(IN_TYPE *arr,
                                             float clip_min,
diff --git a/ptypy/accelerate/cuda_common/common.cuh b/ptypy/accelerate/cuda_common/common.cuh
new file mode 100644
index 000000000..d2c022373
--- /dev/null
+++ b/ptypy/accelerate/cuda_common/common.cuh
@@ -0,0 +1,12 @@
+#pragma once
+
+#ifndef PTYPY_CUPY_NVTRC
+// pycuda code
+#  include <thrust/complex.h>
+using thrust::complex;
+
+#else
+// cupy code
+#  include <cupy/complex.cuh>
+
+#endif
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu b/ptypy/accelerate/cuda_common/convolution.cu
similarity index 99%
rename from ptypy/accelerate/cuda_pycuda/cuda/convolution.cu
rename to ptypy/accelerate/cuda_common/convolution.cu
index ae42ecba5..d729fd067 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu
+++ b/ptypy/accelerate/cuda_common/convolution.cu
@@ -6,8 +6,7 @@
  * A symmetric convolution kernel is assumed here
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 /** Implements reflect-mode index wrapping
  *
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/delx.cu b/ptypy/accelerate/cuda_common/delx.cu
similarity index 99%
rename from ptypy/accelerate/cuda_pycuda/cuda/delx.cu
rename to ptypy/accelerate/cuda_common/delx.cu
index f2e8a934e..23ce09f05 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/delx.cu
+++ b/ptypy/accelerate/cuda_common/delx.cu
@@ -5,8 +5,7 @@
  * - OUT_TYPE: the data type for the outputs 
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 
 /** Finite difference for forward/backward for any axis that is not the
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/dot.cu b/ptypy/accelerate/cuda_common/dot.cu
similarity index 93%
rename from ptypy/accelerate/cuda_pycuda/cuda/dot.cu
rename to ptypy/accelerate/cuda_common/dot.cu
index 21087abe3..3dfd909cf 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/dot.cu
+++ b/ptypy/accelerate/cuda_common/dot.cu
@@ -1,6 +1,4 @@
-#include <cmath>
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 template <class T>
 __device__ inline T dotmul(const T& a, const T& b)
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu b/ptypy/accelerate/cuda_common/error_reduce.cu
similarity index 100%
rename from ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu
rename to ptypy/accelerate/cuda_common/error_reduce.cu
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu b/ptypy/accelerate/cuda_common/exit_error.cu
similarity index 91%
rename from ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu
rename to ptypy/accelerate/cuda_common/exit_error.cu
index fdac52e46..2dded7e0f 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu
+++ b/ptypy/accelerate/cuda_common/exit_error.cu
@@ -1,9 +1,4 @@
-#include <cassert>
-#include <cmath>
-#include <thrust/complex.h>
-using std::sqrt;
-using thrust::abs;
-using thrust::complex;
+#include "common.cuh"
 
 // specify max number of threads/block and min number of blocks per SM,
 // to assist the compiler in register optimisations.
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu b/ptypy/accelerate/cuda_common/fill3D.cu
similarity index 95%
rename from ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu
rename to ptypy/accelerate/cuda_common/fill3D.cu
index c3f03d8ca..ddaf6b2bc 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu
+++ b/ptypy/accelerate/cuda_common/fill3D.cu
@@ -5,9 +5,7 @@
  * - OUT_TYPE: data type for outputs 
  */
 
-#include <cmath>
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 extern "C" __global__ void fill3D(
     OUT_TYPE* A,
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu b/ptypy/accelerate/cuda_common/fill_b.cu
similarity index 100%
rename from ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu
rename to ptypy/accelerate/cuda_common/fill_b.cu
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu b/ptypy/accelerate/cuda_common/fmag_all_update.cu
similarity index 95%
rename from ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu
rename to ptypy/accelerate/cuda_common/fmag_all_update.cu
index f8f695ca5..42d217a67 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu
+++ b/ptypy/accelerate/cuda_common/fmag_all_update.cu
@@ -6,10 +6,7 @@
  * - MATH_TYPE: the data type used for computation 
  */
 
-#include <cmath>
-#include <thrust/complex.h>
-using std::sqrt;
-using thrust::complex;
+#include "common.cuh"
 
 extern "C" __global__ void fmag_all_update(complex<OUT_TYPE>* f,
                                            const IN_TYPE* fmask,
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu b/ptypy/accelerate/cuda_common/fmag_update_nopbound.cu
similarity index 95%
rename from ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu
rename to ptypy/accelerate/cuda_common/fmag_update_nopbound.cu
index 40a65c172..89e65450b 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu
+++ b/ptypy/accelerate/cuda_common/fmag_update_nopbound.cu
@@ -6,10 +6,7 @@
  * - MATH_TYPE: the data type used for computation
  */
 
-#include <cmath>
-#include <thrust/complex.h>
-using std::sqrt;
-using thrust::complex;
+#include "common.cuh"
 
 extern "C" __global__ void fmag_update_nopbound(complex<OUT_TYPE>* f,
                                                 const IN_TYPE* fmask,
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu b/ptypy/accelerate/cuda_common/fourier_deviation.cu
similarity index 93%
rename from ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu
rename to ptypy/accelerate/cuda_common/fourier_deviation.cu
index 3427222c3..1548094e9 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu
+++ b/ptypy/accelerate/cuda_common/fourier_deviation.cu
@@ -6,12 +6,7 @@
  * - MATH_TYPE: the data type used for computation
  */
 
-#include <cassert>
-#include <cmath>
-#include <thrust/complex.h>
-using std::sqrt;
-using thrust::abs;
-using thrust::complex;
+#include "common.cuh"
 
 // specify max number of threads/block and min number of blocks per SM,
 // to assist the compiler in register optimisations.
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu b/ptypy/accelerate/cuda_common/fourier_error.cu
similarity index 93%
rename from ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu
rename to ptypy/accelerate/cuda_common/fourier_error.cu
index ad483c870..43b4e5208 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu
+++ b/ptypy/accelerate/cuda_common/fourier_error.cu
@@ -7,12 +7,7 @@
  */
 
 
-#include <cassert>
-#include <cmath>
-#include <thrust/complex.h>
-using std::sqrt;
-using thrust::abs;
-using thrust::complex;
+#include "common.cuh"
 
 // specify max number of threads/block and min number of blocks per SM,
 // to assist the compiler in register optimisations.
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error2.cu b/ptypy/accelerate/cuda_common/fourier_error2.cu
similarity index 96%
rename from ptypy/accelerate/cuda_pycuda/cuda/fourier_error2.cu
rename to ptypy/accelerate/cuda_common/fourier_error2.cu
index 86dddf549..36a80c377 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error2.cu
+++ b/ptypy/accelerate/cuda_common/fourier_error2.cu
@@ -2,10 +2,7 @@
  * the modes. It turned out to run about 2x slower than the one without
  * shared memory, so it's not used at this stage.
  */
-#include <cassert>
-#include <cmath>
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 extern "C" __global__ void fourier_error2(int nmodes,
                                           complex<float> *f,
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_update.cu b/ptypy/accelerate/cuda_common/fourier_update.cu
similarity index 97%
rename from ptypy/accelerate/cuda_pycuda/cuda/fourier_update.cu
rename to ptypy/accelerate/cuda_common/fourier_update.cu
index a713c4418..5be874424 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_update.cu
+++ b/ptypy/accelerate/cuda_common/fourier_update.cu
@@ -6,10 +6,7 @@ is 2x slower than individual as we have many idle threads here.
 It is not used at the moment.
 */
 
-#include <cassert>
-#include <cmath>
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 extern "C" __global__ void fourier_update(int nmodes,
                                           complex<float> *f_d,
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu b/ptypy/accelerate/cuda_common/full_reduce.cu
similarity index 97%
rename from ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu
rename to ptypy/accelerate/cuda_common/full_reduce.cu
index 801204aaa..7f53a4b2e 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu
+++ b/ptypy/accelerate/cuda_common/full_reduce.cu
@@ -7,8 +7,6 @@
  */
 
 
-#include <cassert>
-
 extern "C" __global__ void full_reduce(const IN_TYPE* in, OUT_TYPE* out, int size)
 {
   assert(gridDim.x == 1);
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu b/ptypy/accelerate/cuda_common/gd_main.cu
similarity index 95%
rename from ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu
rename to ptypy/accelerate/cuda_common/gd_main.cu
index 1ab643c4c..461e103ae 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu
+++ b/ptypy/accelerate/cuda_common/gd_main.cu
@@ -6,8 +6,7 @@
  * - MATH_TYPE: the data type used for computation 
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 extern "C" __global__ void gd_main(const IN_TYPE* Imodel,
                                    const IN_TYPE* I,
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/get_address.cu b/ptypy/accelerate/cuda_common/get_address.cu
similarity index 93%
rename from ptypy/accelerate/cuda_pycuda/cuda/get_address.cu
rename to ptypy/accelerate/cuda_common/get_address.cu
index dda9b45f1..4c42d295b 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/get_address.cu
+++ b/ptypy/accelerate/cuda_common/get_address.cu
@@ -1,6 +1,4 @@
-#include <thrust/complex.h>
-#include <cassert>
-using thrust::complex;
+#include "common.cuh"
 
 inline __device__ int minimum(int a, int b) { return a < b ? a : b; }
 
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu b/ptypy/accelerate/cuda_common/intens_renorm.cu
similarity index 97%
rename from ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu
rename to ptypy/accelerate/cuda_common/intens_renorm.cu
index d0033f7f4..4acd11cf1 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu
+++ b/ptypy/accelerate/cuda_common/intens_renorm.cu
@@ -6,8 +6,7 @@
  * - MATH_TYPE: the data type used for computation 
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 extern "C" __global__ void step1(const IN_TYPE* Imodel,
                                  const IN_TYPE* I,
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/interpolated_shift.cu b/ptypy/accelerate/cuda_common/interpolated_shift.cu
similarity index 98%
rename from ptypy/accelerate/cuda_pycuda/cuda/interpolated_shift.cu
rename to ptypy/accelerate/cuda_common/interpolated_shift.cu
index 49db445f7..23acce6f9 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/interpolated_shift.cu
+++ b/ptypy/accelerate/cuda_common/interpolated_shift.cu
@@ -1,10 +1,4 @@
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstdlib>
-#include <iostream>
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 __device__ inline complex<float>& ascomplex(float2& f2)
 {
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu b/ptypy/accelerate/cuda_common/log_likelihood.cu
similarity index 97%
rename from ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu
rename to ptypy/accelerate/cuda_common/log_likelihood.cu
index 075d59f0a..c488f8b69 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu
+++ b/ptypy/accelerate/cuda_common/log_likelihood.cu
@@ -6,12 +6,7 @@
  * - MATH_TYPE: the data type used for computation
  */
 
-#include <cassert>
-#include <cmath>
-#include <thrust/complex.h>
-using std::sqrt;
-using thrust::abs;
-using thrust::complex;
+#include "common.cuh"
 
 // specify max number of threads/block and min number of blocks per SM,
 // to assist the compiler in register optimisations.
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu b/ptypy/accelerate/cuda_common/make_a012.cu
similarity index 97%
rename from ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu
rename to ptypy/accelerate/cuda_common/make_a012.cu
index 11ba29f62..760b28913 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu
+++ b/ptypy/accelerate/cuda_common/make_a012.cu
@@ -7,8 +7,7 @@
  * - ACC_TYPE: data type used for accumulation
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 extern "C" __global__ void make_a012(const complex<IN_TYPE>* f,
                                      const complex<IN_TYPE>* a,
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_aux.cu b/ptypy/accelerate/cuda_common/make_aux.cu
similarity index 98%
rename from ptypy/accelerate/cuda_pycuda/cuda/make_aux.cu
rename to ptypy/accelerate/cuda_common/make_aux.cu
index b2f64ba1d..fde2f7812 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/make_aux.cu
+++ b/ptypy/accelerate/cuda_common/make_aux.cu
@@ -6,8 +6,7 @@
  * - MATH_TYPE: the data type used for computation
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 // core calculation function - used by both kernels and inlined
 inline __device__ complex<MATH_TYPE> calculate(
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_exit.cu b/ptypy/accelerate/cuda_common/make_exit.cu
similarity index 97%
rename from ptypy/accelerate/cuda_pycuda/cuda/make_exit.cu
rename to ptypy/accelerate/cuda_common/make_exit.cu
index 956b292dc..e8613da10 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/make_exit.cu
+++ b/ptypy/accelerate/cuda_common/make_exit.cu
@@ -7,8 +7,7 @@
  */
 
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 template <class T>
 __device__ inline void atomicAdd(complex<T>* x, complex<T> y)
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu b/ptypy/accelerate/cuda_common/make_model.cu
similarity index 93%
rename from ptypy/accelerate/cuda_pycuda/cuda/make_model.cu
rename to ptypy/accelerate/cuda_common/make_model.cu
index 22bf7d4ab..727388d65 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu
+++ b/ptypy/accelerate/cuda_common/make_model.cu
@@ -6,8 +6,7 @@
  * - MATH_TYPE: the data type used for computation 
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 extern "C" __global__ void make_model(
     const complex<IN_TYPE>* in, OUT_TYPE* out, int z, int y, int x)
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/mass_center.cu b/ptypy/accelerate/cuda_common/mass_center.cu
similarity index 100%
rename from ptypy/accelerate/cuda_pycuda/cuda/mass_center.cu
rename to ptypy/accelerate/cuda_common/mass_center.cu
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu b/ptypy/accelerate/cuda_common/max_abs2.cu
similarity index 96%
rename from ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu
rename to ptypy/accelerate/cuda_common/max_abs2.cu
index 4da8efb3e..1780bc268 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu
+++ b/ptypy/accelerate/cuda_common/max_abs2.cu
@@ -5,10 +5,7 @@
  * - IN_TYPE: can be float/double or complex<float>/complex<double>
  */
 
-#include <cmath>
-#include <thrust/complex.h>
-using thrust::complex;
-using thrust::norm;
+#include "common.cuh"
 
 inline __device__ OUT_TYPE norm(const float& in) {
     return in*in;
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_norm_local.cu b/ptypy/accelerate/cuda_common/ob_norm_local.cu
similarity index 93%
rename from ptypy/accelerate/cuda_pycuda/cuda/ob_norm_local.cu
rename to ptypy/accelerate/cuda_common/ob_norm_local.cu
index 3969ea6e9..9d14cae6d 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/ob_norm_local.cu
+++ b/ptypy/accelerate/cuda_common/ob_norm_local.cu
@@ -6,12 +6,7 @@
 * - MATH_TYPE: the data type used for computation
 */
 
-#include <cassert>
-#include <cmath>
-#include <thrust/complex.h>
-using std::sqrt;
-using thrust::abs;
-using thrust::complex;
+#include "common.cuh"
 
 // specify max number of threads/block and min number of blocks per SM,
 // to assist the compiler in register optimisations.
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu b/ptypy/accelerate/cuda_common/ob_update.cu
similarity index 97%
rename from ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu
rename to ptypy/accelerate/cuda_common/ob_update.cu
index 29b993fb0..7bf8dddd9 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu
+++ b/ptypy/accelerate/cuda_common/ob_update.cu
@@ -6,8 +6,7 @@
  * - MATH_TYPE: the data type used for computation
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 template <class T>
 __device__ inline void atomicAdd(complex<T>* x, const complex<T>& y)
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu b/ptypy/accelerate/cuda_common/ob_update2.cu
similarity index 98%
rename from ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu
rename to ptypy/accelerate/cuda_common/ob_update2.cu
index 821c04a6d..1e9717b81 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu
+++ b/ptypy/accelerate/cuda_common/ob_update2.cu
@@ -15,9 +15,7 @@
  */
 
 
-#include <cassert>
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 #define pr_dlayer(k) addr[(k)]
 #define ex_dlayer(k) addr[6 * num_pods + (k)]
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu b/ptypy/accelerate/cuda_common/ob_update2_ML.cu
similarity index 98%
rename from ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu
rename to ptypy/accelerate/cuda_common/ob_update2_ML.cu
index b62e66006..8840457c0 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu
+++ b/ptypy/accelerate/cuda_common/ob_update2_ML.cu
@@ -15,9 +15,7 @@
  */
 
 
-#include <cassert>
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 #define pr_dlayer(k) addr[(k)]
 #define ex_dlayer(k) addr[6 * num_pods + (k)]
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu b/ptypy/accelerate/cuda_common/ob_update_ML.cu
similarity index 97%
rename from ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu
rename to ptypy/accelerate/cuda_common/ob_update_ML.cu
index 84e678ebb..3a20024f9 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu
+++ b/ptypy/accelerate/cuda_common/ob_update_ML.cu
@@ -6,8 +6,7 @@
  * - MATH_TYPE: the data type used for computation
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 template <class T>
 __device__ inline void atomicAdd(complex<T>* x, const complex<T>& y)
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu b/ptypy/accelerate/cuda_common/ob_update_local.cu
similarity index 97%
rename from ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu
rename to ptypy/accelerate/cuda_common/ob_update_local.cu
index b3a955868..9ff5b73e6 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu
+++ b/ptypy/accelerate/cuda_common/ob_update_local.cu
@@ -6,8 +6,7 @@
  * - MATH_TYPE: the data type used for computation
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 template <class T>
 __device__ inline void atomicAdd(complex<T>* x, const complex<T>& y)
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_norm_local.cu b/ptypy/accelerate/cuda_common/pr_norm_local.cu
similarity index 93%
rename from ptypy/accelerate/cuda_pycuda/cuda/pr_norm_local.cu
rename to ptypy/accelerate/cuda_common/pr_norm_local.cu
index 6e9a8ea76..a89e8f842 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/pr_norm_local.cu
+++ b/ptypy/accelerate/cuda_common/pr_norm_local.cu
@@ -6,12 +6,7 @@
 * - MATH_TYPE: the data type used for computation
 */
 
-#include <cassert>
-#include <cmath>
-#include <thrust/complex.h>
-using std::sqrt;
-using thrust::abs;
-using thrust::complex;
+#include "common.cuh"
 
 // specify max number of threads/block and min number of blocks per SM,
 // to assist the compiler in register optimisations.
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu b/ptypy/accelerate/cuda_common/pr_update.cu
similarity index 97%
rename from ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu
rename to ptypy/accelerate/cuda_common/pr_update.cu
index 180cf8f14..d7739a569 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu
+++ b/ptypy/accelerate/cuda_common/pr_update.cu
@@ -6,8 +6,7 @@
  * - MATH_TYPE: the data type used for computation
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 template <class T, class U>
 __device__ inline void atomicAdd(complex<T>* x, const complex<U>& y)
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu b/ptypy/accelerate/cuda_common/pr_update2.cu
similarity index 98%
rename from ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu
rename to ptypy/accelerate/cuda_common/pr_update2.cu
index e5417cc01..09913bc02 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu
+++ b/ptypy/accelerate/cuda_common/pr_update2.cu
@@ -14,9 +14,7 @@
  * and the kernel will get considerably slower.
  */
 
-#include <cassert>
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 #define pr_dlayer(k) addr[(k)]
 #define pr_roi_row(k) addr[1 * num_pods + (k)]
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu b/ptypy/accelerate/cuda_common/pr_update2_ML.cu
similarity index 98%
rename from ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu
rename to ptypy/accelerate/cuda_common/pr_update2_ML.cu
index 8a45891c5..167610ea6 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu
+++ b/ptypy/accelerate/cuda_common/pr_update2_ML.cu
@@ -14,9 +14,7 @@
  * and the kernel will get considerably slower.
  */
 
-#include <cassert>
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 #define pr_dlayer(k) addr[(k)]
 #define pr_roi_row(k) addr[1 * num_pods + (k)]
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu b/ptypy/accelerate/cuda_common/pr_update_ML.cu
similarity index 97%
rename from ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu
rename to ptypy/accelerate/cuda_common/pr_update_ML.cu
index 3fa24137d..ad32dfe8a 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu
+++ b/ptypy/accelerate/cuda_common/pr_update_ML.cu
@@ -7,8 +7,7 @@
  */
 
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 template <class T>
 __device__ inline void atomicAdd(complex<T>* x, const complex<T>& y)
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu b/ptypy/accelerate/cuda_common/pr_update_local.cu
similarity index 97%
rename from ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu
rename to ptypy/accelerate/cuda_common/pr_update_local.cu
index d515afd55..cf221aadd 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu
+++ b/ptypy/accelerate/cuda_common/pr_update_local.cu
@@ -7,8 +7,7 @@
  * - ACC_TYPE: data type used in norm calculation (input here)
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 template <class T, class U>
 __device__ inline void atomicAdd(complex<T>* x, const complex<U>& y)
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu b/ptypy/accelerate/cuda_common/transpose.cu
similarity index 96%
rename from ptypy/accelerate/cuda_pycuda/cuda/transpose.cu
rename to ptypy/accelerate/cuda_common/transpose.cu
index 8de4e7ad7..f00be8937 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu
+++ b/ptypy/accelerate/cuda_common/transpose.cu
@@ -10,8 +10,7 @@
  * - DTYPE - any pod type
  */
 
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 extern "C" __global__ void transpose(const DTYPE* idata,
                                      DTYPE* odata,
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu b/ptypy/accelerate/cuda_common/update_addr_error_state.cu
similarity index 94%
rename from ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu
rename to ptypy/accelerate/cuda_common/update_addr_error_state.cu
index 1220a0986..e4045a38b 100644
--- a/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu
+++ b/ptypy/accelerate/cuda_common/update_addr_error_state.cu
@@ -5,9 +5,7 @@
  * - OUT_TYPE: the data type for the outputs (float or double)
  */
 
-#include <cassert>
-#include <thrust/complex.h>
-using thrust::complex;
+#include "common.cuh"
 
 extern "C" __global__ void update_addr_error_state(int* __restrict addr,
                                                    const int* __restrict mangled_addr,
diff --git a/ptypy/accelerate/cuda_common/utils.py b/ptypy/accelerate/cuda_common/utils.py
new file mode 100644
index 000000000..a953bfdb4
--- /dev/null
+++ b/ptypy/accelerate/cuda_common/utils.py
@@ -0,0 +1,18 @@
+import numpy as np
+
+# maps a numpy dtype to the corresponding C type
+def map2ctype(dt):
+    """Return the C type name for numpy dtype `dt`; raise ValueError if unmapped."""
+    # ordered (numpy type, C type name) pairs, compared with == against `dt`
+    ctype_names = (
+        (np.float32, 'float'),
+        (np.float64, 'double'),
+        (np.complex64, 'complex<float>'),
+        (np.complex128, 'complex<double>'),
+        (np.int32, 'int'),
+        (np.int64, 'long long'),
+    )
+    for numpy_type, ctype in ctype_names:
+        if dt == numpy_type:
+            return ctype
+    raise ValueError('No mapping for {}'.format(dt))
diff --git a/ptypy/accelerate/cuda_cupy/__init__.py b/ptypy/accelerate/cuda_cupy/__init__.py
new file mode 100644
index 000000000..717878241
--- /dev/null
+++ b/ptypy/accelerate/cuda_cupy/__init__.py
@@ -0,0 +1,73 @@
+
+from typing import Optional
+import cupy as cp
+import os
+
+from ptypy.utils.verbose import headerline, log
+
+kernel_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'cuda_common'))
+compile_options =['-std=c++14', '-DPTYPY_CUPY_NVTRC=1', '-I' + kernel_dir, '-DNDEBUG']
+queue = None
+device = None
+
+
+def get_context(new_queue=False):
+    """Return the module-level stream, (re)creating it on GPU `rank_local` if unset or `new_queue`."""
+    from ptypy.utils import parallel
+    # `queue` and `device` are module globals caching the stream and the selected device
+    global queue, device
+    # initialise on first use, or again when the caller asks for a new stream
+    if queue is None or new_queue:
+        ndevs = cp.cuda.runtime.getDeviceCount()
+        if parallel.rank_local >= ndevs:
+            raise Exception('Local rank must be smaller than total device count, '
+                'rank={}, rank_local={}, device_count={}'.format(
+                parallel.rank, parallel.rank_local, ndevs
+            ))
+        device = cp.cuda.Device(parallel.rank_local)
+        device.use()
+        queue = cp.cuda.Stream()
+
+    return queue
+
+
+def load_kernel(name, subs={}, file=None, options=None):
+    """Load a kernel source file from kernel_dir, apply the textual `subs`
+    replacements and return the function `name` from a cupy RawModule, or a
+    tuple of functions if `name` is not a string (then `file` is required)."""
+    if file is not None:
+        source_path = "%s/%s" % (kernel_dir, file)
+    elif isinstance(name, str):
+        source_path = "%s/%s.cu" % (kernel_dir, name)
+    else:
+        raise ValueError(
+            "name parameter must be a string if not filename is given")
+    with open(source_path, 'r') as handle:
+        code = handle.read()
+    # plain text substitution of every key in `subs` by str(value)
+    for placeholder, value in list(subs.items()):
+        code = code.replace(placeholder, str(value))
+    # prepend a #line directive so compiler errors refer to the source file
+    line_directive = '#line 1 "{}"\n'.format(source_path.replace("\\", "\\\\"))
+    code = line_directive + code
+
+    flags = list(compile_options)
+    if options is not None:
+        flags.extend(options)
+    module = cp.RawModule(code=code, options=tuple(flags))
+    if not isinstance(name, str):
+        return tuple(module.get_function(n) for n in name)
+    return module.get_function(name)
+
+def log_device_memory_stats(level=4, heading: str ='Device Memory Stats'):
+    """Log device memory and cupy memory pool statistics at verbosity `level`."""
+    mempool, pinned_pool = cp.get_default_memory_pool(), cp.get_default_pinned_memory_pool()
+    log(level, '\n' + headerline(heading))
+    log(level, f'Device id             : {cp.cuda.Device().id}')
+    log(level, f'Total device mem      : {cp.cuda.runtime.memGetInfo()[1]/1024/1024} MB')
+    log(level, f'Free device mem       : {cp.cuda.runtime.memGetInfo()[0]/1024/1024} MB')
+    log(level, f'MemoryPool size       : {mempool.total_bytes()/1024/1024} MB')
+    log(level, f'MemoryPool used       : {mempool.used_bytes()/1024/1024} MB')
+    log(level, f'MemoryPool limit      : {mempool.get_limit()/1024/1024} MB')
+    log(level, f'MemoryPool free blocks: {mempool.n_free_blocks()}')
+    log(level, f'PinnedPool free blocks: {pinned_pool.n_free_blocks()}')
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_cupy/address_manglers.py b/ptypy/accelerate/cuda_cupy/address_manglers.py
new file mode 100644
index 000000000..ae4eeadbe
--- /dev/null
+++ b/ptypy/accelerate/cuda_cupy/address_manglers.py
@@ -0,0 +1,81 @@
+from . import load_kernel
+import numpy as np
+from ptypy.accelerate.base import address_manglers as npam
+import cupy as cp
+
+
+class BaseMangler(npam.BaseMangler):
+    """npam.BaseMangler variant that keeps the shifts on the GPU and uses the get_address kernel."""
+    def __init__(self, *args, queue_thread=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.queue = queue_thread  # optional stream, made current via .use() before GPU work
+        self.get_address_cuda = load_kernel("get_address")
+        self.delta = None  # host-side shifts; must be set before _setup_delta_gpu (see assert)
+        self.delta_gpu = None  # device copy of self.delta, allocated in _setup_delta_gpu
+
+    def _setup_delta_gpu(self):  # upload self.delta (as contiguous int32) into delta_gpu
+        if self.queue is not None:
+            self.queue.use()
+        assert self.delta is not None, "Setup delta using the setup_shifts method first"
+        self.delta = np.ascontiguousarray(self.delta, dtype=np.int32)
+        # reuse the device buffer unless it is missing or has fewer rows than delta
+        if self.delta_gpu is None or self.delta_gpu.shape[0] < self.delta.shape[0]:
+            self.delta_gpu = cp.empty(self.delta.shape, dtype=np.int32)
+        # in case self.delta is smaller than delta_gpu, this will only copy the
+        # relevant part
+        cp.cuda.runtime.memcpy(dst=self.delta_gpu.data.ptr,
+                               src=self.delta.ctypes.data,
+                               size=self.delta.size * self.delta.itemsize,
+                               kind=1) # host to device
+
+    # Launch the get_address kernel on addr_current/mangled_addr with delta row `index`.
+    def get_address(self, index, addr_current, mangled_addr, max_oby, max_obx):
+        assert addr_current.dtype == np.int32, "addresses must be int32"
+        assert mangled_addr.dtype == np.int32, "addresses must be int32"
+        assert len(addr_current.shape) == 4, "addresses must be 4 dimensions"
+        assert addr_current.shape == mangled_addr.shape, "output addresses must be pre-allocated"
+        assert self.delta_gpu is not None, "Deltas are not set yet - call setup_shifts first"
+        assert index < self.delta_gpu.shape[0], "Index out of range for deltas"
+        assert isinstance(self.delta_gpu, cp.ndarray), "Only GPU arrays are supported for delta"
+        # make the configured stream (if any) current before launching
+        if self.queue is not None:
+            self.queue.use()
+
+        # only using a single thread block here as it's not enough work
+        # otherwise
+        self.get_address_cuda(
+            (1, 1, 1),
+            (64, 1, 1),
+            (addr_current,
+            mangled_addr,
+            np.int32(addr_current.shape[0] * addr_current.shape[1]),
+            self.delta_gpu[index,None],
+            np.int32(max_oby),
+            np.int32(max_obx)))
+
+# with multiple inheritance, we have to be explicit which super class 
+# we are calling in the methods
+class RandomIntMangler(BaseMangler, npam.RandomIntMangler):
+    """Uses npam.RandomIntMangler.setup_shifts for the shifts and BaseMangler for GPU addresses."""
+    def __init__(self, *args, **kwargs):
+        BaseMangler.__init__(self, *args, **kwargs)
+
+    def setup_shifts(self, *args, **kwargs):
+        npam.RandomIntMangler.setup_shifts(self, *args, **kwargs)
+        self._setup_delta_gpu()  # then upload self.delta to the device
+
+    def get_address(self, *args, **kwargs):
+        BaseMangler.get_address(self, *args, **kwargs)
+
+
+class GridSearchMangler(BaseMangler, npam.GridSearchMangler):
+    """Uses npam.GridSearchMangler.setup_shifts for the shifts and BaseMangler for GPU addresses."""
+    def __init__(self, *args, **kwargs):
+        BaseMangler.__init__(self, *args, **kwargs)
+
+    def setup_shifts(self, *args, **kwargs):
+        npam.GridSearchMangler.setup_shifts(self, *args, **kwargs)
+        self._setup_delta_gpu()  # then upload self.delta to the device
+
+    def get_address(self, *args, **kwargs):
+        BaseMangler.get_address(self, *args, **kwargs)
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_cupy/array_utils.py b/ptypy/accelerate/cuda_cupy/array_utils.py
new file mode 100644
index 000000000..911c6111d
--- /dev/null
+++ b/ptypy/accelerate/cuda_cupy/array_utils.py
@@ -0,0 +1,670 @@
+import cupy as cp
+import numpy as np
+
+from ptypy.accelerate.cuda_common.utils import map2ctype
+from ptypy.utils.math_utils import gaussian
+from . import load_kernel
+
+
+class ArrayUtilsKernel:
+    """Dot product and squared norm of GPU arrays via the "dot" and "full_reduce" kernels."""
+    def __init__(self, acc_dtype=cp.float64, queue=None):
+        self.queue = queue
+        self.acc_dtype = acc_dtype
+        # Note: cupy's ReductionKernel is far less efficient
+        self.cdot_cuda = load_kernel("dot", {
+            'IN_TYPE': 'complex<float>',
+            'ACC_TYPE': 'double' if acc_dtype == np.float64 else 'float'
+        })
+        self.dot_cuda = load_kernel("dot", {
+            'IN_TYPE': 'float',
+            'ACC_TYPE': 'double' if acc_dtype == np.float64 else 'float'
+        })
+        self.full_reduce_cuda = load_kernel("full_reduce", {
+            'IN_TYPE': 'double' if acc_dtype == np.float64 else 'float',
+            'OUT_TYPE': 'double' if acc_dtype == np.float64 else 'float',
+            'ACC_TYPE': 'double' if acc_dtype == np.float64 else 'float',
+            'BDIM_X': 1024
+        })
+        self.Ctmp = None  # per-block partial results, (re)allocated in dot() when too small
+
+    def dot(self, A: cp.ndarray, B: cp.ndarray, out: cp.ndarray = None) -> cp.ndarray:
+        """Run the "dot" kernel on A and B and return the 1-element result array `out`."""
+        assert A.dtype == B.dtype, "Input arrays must be of same data type"
+        assert A.size == B.size, "Input arrays must be of the same size"
+        if self.acc_dtype not in (np.float32, np.float64):
+            raise ValueError('acc_dtype must be float32 or float64')
+        if self.queue is not None:
+            self.queue.use()
+        if out is None:
+            out = cp.empty(1, dtype=self.acc_dtype)
+
+        block = (1024, 1, 1)
+        grid = (int((B.size + 1023) // 1024), 1, 1)
+        elsize = 8 if self.acc_dtype == np.float64 else 4  # bytes per accumulator element
+        if self.Ctmp is None or self.Ctmp.size < grid[0]:
+            self.Ctmp = cp.zeros((grid[0],), dtype=self.acc_dtype)
+        Ctmp = self.Ctmp
+        if grid[0] == 1:
+            Ctmp = out
+        if np.iscomplexobj(B):
+            self.cdot_cuda(grid, block, (A, B, np.int32(A.size), Ctmp),
+                           shared_mem=1024 * elsize)
+        else:
+            self.dot_cuda(grid, block, (A, B, np.int32(A.size), Ctmp),
+                          shared_mem=1024 * elsize)
+        if grid[0] > 1:
+            self.full_reduce_cuda((1, 1, 1), (1024, 1, 1), (self.Ctmp, out, np.int32(grid[0])),
+                                  shared_mem=elsize*1024)
+        return out
+
+    def norm2(self, A, out=None):
+        """Return dot(A, A, out)."""
+        return self.dot(A, A, out)
+
+
+class TransposeKernel:
+    """Transpose 2D int32 arrays with the "transpose" CUDA kernel (16x16 thread blocks)."""
+
+    def __init__(self, queue=None):
+        self.queue = queue
+        kernel_subs = {'DTYPE': 'int', 'BDIM': 16}
+        self.transpose_cuda = load_kernel("transpose", kernel_subs)
+
+    def transpose(self, input, output):
+        """Launch the kernel that transposes `input` into `output`.
+
+        `input` must be 2D, `output` of flipped shape and both int32, else ValueError.
+        """
+        # only for int at the moment (addr array), and 2D (reshape pls)
+        if len(input.shape) != 2:
+            raise ValueError(
+                "Only 2D tranpose is supported - reshape as desired")
+        height, width = input.shape
+        if height != output.shape[1] or width != output.shape[0]:
+            raise ValueError("Input/Output must be of flipped shape")
+        if input.dtype != np.int32 or output.dtype != np.int32:
+            raise ValueError("Only int types are supported at the moment")
+
+        # grid rounded up so that 16x16 blocks cover the whole input
+        threads = (16, 16, 1)
+        blocks = (int((width + 15) // 16), int((height + 15) // 16), 1)
+
+        if self.queue is not None:
+            self.queue.use()
+        launch_args = (input, output, np.int32(width), np.int32(height))
+        self.transpose_cuda(blocks, threads, launch_args)
+
+
+class MaxAbs2Kernel:
+    """Two-step max(abs(x)**2) reduction; kernels are loaded lazily per type and row count."""
+    def __init__(self, queue=None):
+        self.queue = queue
+        # we lazy-load this depending on the data types we get
+        self.max_abs2_cuda = {}
+
+    def max_abs2(self, X: cp.ndarray, out: cp.ndarray):
+        """ Calculate max(abs(x)**2) across the final 2 dimensions"""
+        rows = np.int32(X.shape[-2])
+        cols = np.int32(X.shape[-1])
+        firstdims = np.int32(np.prod(X.shape[:-2]))  # product of all leading dimensions
+        gy = int(rows)  # step1 grid size in y (one block per row)
+        # lazy-loading, keeping scratch memory and both kernels in the same dictionary
+        bx = int(64)
+        version = '{},{},{}'.format(
+            map2ctype(X.dtype), map2ctype(out.dtype), gy)
+        if version not in self.max_abs2_cuda:
+            step1, step2 = load_kernel(
+                ("max_abs2_step1", "max_abs2_step2"),
+                {
+                    'IN_TYPE': map2ctype(X.dtype),
+                    'OUT_TYPE': map2ctype(out.dtype),
+                    'BDIM_X': bx,
+                }, "max_abs2.cu")
+            self.max_abs2_cuda[version] = {
+                'step1': step1,
+                'step2': step2,
+                'scratchmem': cp.empty((gy,), dtype=out.dtype)
+            }
+
+        # The cache key contains gy, so the cached scratch buffer (allocated
+        # with gy elements above) always matches this call's row count; it is
+        # written by step1 and read by step2 below.
+        scratch = self.max_abs2_cuda[version]['scratchmem']
+
+        if self.queue is not None:
+            self.queue.use()
+
+        self.max_abs2_cuda[version]['step1'](
+            (1, gy, 1), (bx, 1, 1), (X, firstdims, rows, cols, scratch))
+        self.max_abs2_cuda[version]['step2'](
+            (1, 1, 1), (bx, 1, 1), (scratch, np.int32(gy), out))
+
+
+class CropPadKernel:
+    """Copy the overlapping part of one GPU array into another using the "fill3D" kernel."""
+    def __init__(self, queue=None):
+        self.queue = queue
+        # we lazy-load this depending on the data types we get
+        self.fill3D_cuda = {}
+
+    def fill3D(self, A, B, offset=[0, 0, 0]):
+        """Fill A with the overlapping region of B over the last three axes.
+        `offset` is B's origin in A's index frame; negative entries skip the
+        start of B. Raises ValueError if either array has fewer than 3 dims."""
+        if A.ndim < 3 or B.ndim < 3:
+            raise ValueError('Input arrays must each be at least 3D')
+        assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions."
+        ash = A.shape
+        bsh = B.shape
+        misfit = np.array(bsh) - np.array(ash)
+        assert not misfit[:-3].any(
+        ), "Input and Output must have the same shape everywhere but the last three axes."
+
+        Alim = np.array(A.shape[-3:])
+        Blim = np.array(B.shape[-3:])
+        off = np.array(offset)
+        Ao = off.copy()  # start index in A: the offset with negative entries clamped to 0
+        Ao[Ao < 0] = 0
+        Bo = -off.copy()  # start index in B: the negated offset, clamped the same way
+        Bo[Bo < 0] = 0
+        assert (Bo < Blim).all() and (Ao < Alim).all(
+        ), "At least one dimension lacks overlap"
+        Ao = Ao.astype(np.int32)
+        Bo = Bo.astype(np.int32)
+        lengths = np.array([  # extent of the overlap per axis, measured from A's side
+            min(off[0] + Blim[0], Alim[0]) - Ao[0],
+            min(off[1] + Blim[1], Alim[1]) - Ao[1],
+            min(off[2] + Blim[2], Alim[2]) - Ao[2],
+        ], dtype=np.int32)
+        lengths2 = np.array([  # the same extent measured from B's side (consistency check)
+            min(Alim[0] - off[0], Blim[0]) - Bo[0],
+            min(Alim[1] - off[1], Blim[1]) - Bo[1],
+            min(Alim[2] - off[2], Blim[2]) - Bo[2],
+        ], dtype=np.int32)
+        assert (lengths == lengths2).all(
+        ), "left and right lenghts are not matching"
+        batch = int(np.prod(A.shape[:-3]))  # product of all dimensions before the last three
+
+        # lazy loading depending on data type
+        version = '{},{}'.format(map2ctype(B.dtype), map2ctype(A.dtype))
+        if version not in self.fill3D_cuda:
+            self.fill3D_cuda[version] = load_kernel("fill3D", {
+                'IN_TYPE': map2ctype(B.dtype),
+                'OUT_TYPE': map2ctype(A.dtype)
+            })
+        bx = by = 32
+        if self.queue is not None:
+            self.queue.use()
+        self.fill3D_cuda[version](
+            (int((lengths[2] + bx - 1)//bx),
+             int((lengths[1] + by - 1)//by),
+             int(batch)),
+            (int(bx), int(by), int(1)),
+            (A, B,
+             np.int32(A.shape[-3]), np.int32(A.shape[-2]
+                                             ), np.int32(A.shape[-1]),
+             np.int32(B.shape[-3]), np.int32(B.shape[-2]
+                                             ), np.int32(B.shape[-1]),
+             Ao[0], Ao[1], Ao[2],
+             Bo[0], Bo[1], Bo[2],
+             lengths[0], lengths[1], lengths[2])
+        )
+
+    def crop_pad_2d_simple(self, A, B):
+        """
+        Places B in A centered around the last two axes. A and B must be of the same shape
+        anywhere but the last two dims. 2D inputs are reshaped to 3D before calling fill3D.
+        """
+        assert A.ndim >= 2, "Arrays must have more than 2 dimensions."  # NOTE(review): check accepts ndim == 2, message says "more than 2"
+        assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions."
+        misfit = np.array(A.shape) - np.array(B.shape)
+        assert not misfit[:-2].any(
+        ), "Input and Output must have the same shape everywhere but the last two axes."
+        if A.ndim == 2:
+            A = A.reshape((1,) + A.shape)
+        if B.ndim == 2:
+            B = B.reshape((1,) + B.shape)
+        a1, a2 = A.shape[-2:]
+        b1, b2 = B.shape[-2:]
+        offset = [0, a1 // 2 - b1 // 2, a2 // 2 - b2 // 2]  # aligns the centres of the last two axes
+        self.fill3D(A, B, offset)
+
+
+class DerivativesKernel:
+    """
+    Wrapper around the ``delx_last`` / ``delx_mid`` kernels of ``delx.cu``.
+
+    Both kernels are compiled twice for the requested dtype: once with
+    ``IS_FORWARD='true'`` (used by :py:meth:`delxf`) and once with
+    ``IS_FORWARD='false'`` (used by :py:meth:`delxb`).
+    """
+
+    def __init__(self, dtype, queue=None):
+        """
+        :param dtype: np.float32 or np.complex64.
+        :param queue: optional stream; if given, ``queue.use()`` is called
+                      before every kernel launch.
+        :raises NotImplementedError: for any other ``dtype``.
+        """
+        if dtype == np.float32:
+            stype = "float"
+        elif dtype == np.complex64:
+            stype = "complex<float>"
+        else:
+            raise NotImplementedError(
+                "delxf is only implemented for float32 and complex64")
+
+        self.queue = queue
+        self.dtype = dtype
+        self.last_axis_block = (256, 4, 1)
+        self.mid_axis_block = (256, 4, 1)
+
+        self.delxf_last, self.delxf_mid = self._load('true', stype)
+        self.delxb_last, self.delxb_mid = self._load('false', stype)
+
+    def _load(self, is_forward, stype):
+        """Compile the (last-axis, mid-axis) kernel pair for one direction."""
+        return load_kernel(
+            ("delx_last", "delx_mid"),
+            file="delx.cu",
+            subs={
+                'IS_FORWARD': is_forward,
+                'BDIM_X': str(self.last_axis_block[0]),
+                'BDIM_Y': str(self.last_axis_block[1]),
+                'IN_TYPE': stype,
+                'OUT_TYPE': stype
+            })
+
+    def _delx(self, knl_last, knl_mid, input, out, axis):
+        """
+        Shared implementation of :py:meth:`delxf` and :py:meth:`delxb`.
+
+        Launches ``knl_last`` if ``axis`` is the last axis of ``input`` and
+        ``knl_mid`` otherwise. A negative ``axis`` counts from the end.
+
+        :raises ValueError: if ``input.dtype`` differs from ``self.dtype``.
+        """
+        if input.dtype != self.dtype:
+            raise ValueError('Invalid input data type')
+
+        if axis < 0:
+            axis = input.ndim + axis
+        axis = np.int32(axis)
+
+        if self.queue is not None:
+            self.queue.use()
+
+        axis_len = np.int32(input.shape[axis])
+        # np.prod instead of np.product: the alias was removed in NumPy 2.0
+        if axis == input.ndim - 1:
+            # all leading axes are flattened into a single dimension
+            flat_dim = np.int32(np.prod(input.shape[0:-1]))
+            by = self.last_axis_block[1]
+            knl_last((int((flat_dim + by - 1) // by), 1, 1),
+                     self.last_axis_block,
+                     (input, out, flat_dim, axis_len))
+        else:
+            # axes after / before ``axis`` are flattened separately
+            lower_dim = np.int32(np.prod(input.shape[(axis+1):]))
+            higher_dim = np.int32(np.prod(input.shape[:axis]))
+            bx = self.mid_axis_block[0]
+            knl_mid((int((lower_dim + bx - 1) // bx), 1, int(higher_dim)),
+                    self.mid_axis_block,
+                    (input, out, lower_dim, higher_dim, axis_len))
+
+    def delxf(self, input, out, axis=-1):
+        """Run the ``IS_FORWARD='true'`` kernels on ``input`` along ``axis``."""
+        self._delx(self.delxf_last, self.delxf_mid, input, out, axis)
+
+    def delxb(self, input, out, axis=-1):
+        """Run the ``IS_FORWARD='false'`` kernels on ``input`` along ``axis``."""
+        self._delx(self.delxb_last, self.delxb_mid, input, out, axis)
+
+
+class GaussianSmoothingKernel:
+    """
+    Separable Gaussian smoothing built on the ``convolution_row`` and
+    ``convolution_col`` kernels of ``convolution.cu``, which are compiled
+    here for ``complex<float>`` data.
+    """
+
+    def __init__(self, queue=None, num_stdevs=4, kernel_type='float'):
+        """
+        :param queue: optional stream; ``queue.use()`` is called before launching.
+        :param num_stdevs: kernel radius is ``int(num_stdevs * std + 0.5)``.
+        :param kernel_type: 'float' or 'double'; substituted as MATH_TYPE and
+                            used as dtype of the coefficients on the GPU.
+        :raises ValueError: for any other ``kernel_type``.
+        """
+        if kernel_type not in ['float', 'double']:
+            raise ValueError('Invalid data type for kernel')
+        self.kernel_type = kernel_type
+        self.dtype = np.complex64
+        self.stype = "complex<float>"
+        self.queue = queue
+        self.num_stdevs = num_stdevs
+        self.blockdim_x = 4
+        self.blockdim_y = 16
+
+        # At least 2 blocks per SM
+        self.max_shared_per_block = 48 * 1024 // 2
+        self.max_shared_per_block_complex = self.max_shared_per_block / \
+            2 * np.dtype(np.float32).itemsize
+        self.max_kernel_radius = int(
+            self.max_shared_per_block_complex / self.blockdim_y)
+
+        self.convolution_row = self._load(
+            "convolution_row", self.blockdim_x, self.blockdim_y)
+        # NOTE: x and y block dimensions are swapped for the column kernel
+        self.convolution_col = self._load(
+            "convolution_col", self.blockdim_y, self.blockdim_x)
+
+        # pre-allocate kernel memory on gpu, sized by the maximum radius
+        self.kernel_gpu = cp.empty(
+            (self.max_kernel_radius,), dtype=self._coeff_dtype())
+        # remember the last radius and std to decide if a new transfer is needed
+        self.r = 0
+        self.std = 0
+
+    def _coeff_dtype(self):
+        """Return the numpy dtype that corresponds to ``self.kernel_type``."""
+        return np.float32 if self.kernel_type == 'float' else np.float64
+
+    def _load(self, name, bdim_x, bdim_y):
+        """Compile kernel ``name`` of convolution.cu for the given block size."""
+        return load_kernel(name, file="convolution.cu", subs={
+            'BDIM_X': bdim_x,
+            'BDIM_Y': bdim_y,
+            'DTYPE': self.stype,
+            'MATH_TYPE': self.kernel_type
+        })
+
+    def _radius(self, std):
+        """
+        Return the kernel radius for ``std``; coefficients are re-uploaded to
+        ``self.kernel_gpu`` only if radius or std differ from the previous call.
+        """
+        r = int(self.num_stdevs * std + 0.5)
+        if r > self.max_kernel_radius:
+            raise ValueError("Size of Gaussian kernel too large")
+        if r == self.r and std == self.std:
+            return r
+        # recalculate + transfer; only the half from the centre on is stored
+        g = gaussian(np.arange(-r, r+1), std)
+        g /= g.sum()
+        k = np.ascontiguousarray(g[r:].astype(self._coeff_dtype()))
+        self.kernel_gpu[:r+1] = cp.asarray(k[:])
+        self.r = r
+        self.std = std
+        return r
+
+    def _run_pass(self, knl, bx, by, std, src, dst, y, x, batches):
+        """
+        Convolve ``src`` into ``dst`` by launching ``knl`` with block
+        ``(bx, by, 1)``; raises MemoryError if the shared memory limit is exceeded.
+        """
+        r = self._radius(std)
+        # same size for both kernels: the column kernel only swaps bx and by
+        shared = (self.blockdim_x + 2*r) * self.blockdim_y * \
+            np.dtype(np.complex64).itemsize
+        if shared > self.max_shared_per_block:
+            raise MemoryError("Cannot run kernel in shared memory")
+        grd = (int((y + bx - 1) // bx), int((x + by-1) // by), batches)
+        knl(grd, (bx, by, 1),
+            (src, dst, np.int32(y), np.int32(x), self.kernel_gpu, np.int32(r)),
+            shared_mem=shared)
+
+    def convolution(self, data, mfs, tmp=None):
+        """
+        Smooth ``data`` in-place with a stacked 2D Gaussian convolution.
+
+        :param data: gpu array with 1 to 3 dimensions; with 3 dimensions the
+                     first axis is a stack of 2D frames.
+        :param mfs: standard deviations, unpacked as ``(stdy, stdx)``; for 1D
+                    data only ``mfs[0]`` is used.
+        :param tmp: optional gpu scratch array of the same shape and dtype
+                    as ``data``; allocated here if not given.
+        :raises NotImplementedError: if ``data`` has 0 or more than 3 dims.
+        """
+        ndims = data.ndim
+        shape = data.shape
+
+        # Create temporary array (if not given)
+        if tmp is None:
+            tmp = cp.empty(shape, dtype=data.dtype)
+        assert shape == tmp.shape and data.dtype == tmp.dtype
+
+        # Check input dimensions
+        if not 0 < ndims <= 3:
+            raise NotImplementedError(
+                "input needs to be of dimensions 0 < ndims <= 3")
+        if ndims == 1:
+            y, x = shape[0], 1
+            stdy, stdx = mfs[0], 0.0
+        else:
+            y, x = shape[-2:]
+            stdy, stdx = mfs
+        batches = shape[0] if ndims == 3 else 1
+
+        if self.queue is not None:
+            self.queue.use()
+
+        # TODO: is this threshold (0.1) acceptable in all cases?
+        do_rows = stdx > 0.1
+        if do_rows:
+            self._run_pass(self.convolution_row, self.blockdim_x,
+                           self.blockdim_y, stdx, data, tmp, y, x, batches)
+        # the column pass continues from wherever the row pass left its result
+        src, dst = (tmp, data) if do_rows else (data, tmp)
+        do_cols = stdy > 0.1
+        if do_cols:
+            self._run_pass(self.convolution_col, self.blockdim_y,
+                           self.blockdim_x, stdy, src, dst, y, x, batches)
+
+        # neither "both passes ran" nor "no pass requested": copy tmp back
+        if not ((do_rows and do_cols) or (stdx <= 0.1 and stdy <= 0.1)):
+            data[:] = tmp[:]
+
+
+class ClipMagnitudesKernel:
+    """Wrapper for the ``clip_magnitudes`` kernel, compiled for complex<float>."""
+
+    def __init__(self, queue=None):
+        """:param queue: optional stream; ``queue.use()`` precedes the launch."""
+        self.queue = queue
+        self.clip_magnitudes_cuda = load_kernel(
+            "clip_magnitudes", {'IN_TYPE': 'complex<float>'})
+
+    def clip_magnitudes_to_range(self, array, clip_min, clip_max):
+        """
+        Launch the kernel on ``array`` with the bounds ``clip_min`` and
+        ``clip_max`` (converted to float32), using blocks of 256 threads.
+        """
+        if self.queue is not None:
+            self.queue.use()
+        bx = 256
+        npixel = np.int32(np.prod(array.shape))
+        args = (array, np.float32(clip_min), np.float32(clip_max), npixel)
+        self.clip_magnitudes_cuda((int((npixel + bx - 1) // bx), 1, 1), (bx, 1, 1), args)
+
+class MassCenterKernel:
+    """
+    Centre-of-mass computation for float32 arrays using the kernels of
+    ``mass_center.cu``.
+    """
+
+    def __init__(self, queue=None):
+        """:param queue: optional stream; ``queue.use()`` precedes the launches."""
+        self.queue = queue
+        self.threadsPerBlock = 256
+
+        self.indexed_sum_middim_cuda = self._load(
+            "indexed_sum_middim", self.threadsPerBlock, 1)
+        self.indexed_sum_lastdim_cuda = self._load("indexed_sum_lastdim", 32, 32)
+        self.final_sums_cuda = self._load("final_sums", 256, 1)
+
+    @staticmethod
+    def _load(name, bdim_x, bdim_y):
+        """Compile kernel ``name`` of mass_center.cu for float input."""
+        return load_kernel(name, file="mass_center.cu", subs={
+            'IN_TYPE': 'float',
+            'BDIM_X': bdim_x,
+            'BDIM_Y': bdim_y,
+        })
+
+    def mass_center(self, array):
+        """
+        Launch the index-weighted sum kernels on ``array`` and combine them.
+
+        :param array: float32 gpu array; ``shape[0]`` and ``shape[1]`` are
+                      always read, ``shape[2]`` only if ``array.ndim >= 3``.
+        :return: float32 gpu array with 3 entries if the third dimension is
+                 larger than 1, otherwise with 2 entries.
+        :raises NotImplementedError: if ``array`` is not float32.
+        """
+        if array.dtype != np.float32:
+            raise NotImplementedError("mass_center is only implemented for float32")
+
+        is_3d = array.ndim >= 3
+        i = np.int32(array.shape[0])
+        m = np.int32(array.shape[1])
+        n = np.int32(array.shape[2]) if is_3d else np.int32(1)
+        nout = 3 if n > 1 else 2
+
+        if self.queue is not None:
+            self.queue.use()
+
+        # every partial sum is scaled by the inverse of the total sum
+        total_sum = cp.sum(array, dtype=np.float32).get()
+        sc = np.float32(1. / total_sum.item())
+
+        i_sum = cp.empty(array.shape[0], dtype=np.float32)
+        m_sum = cp.empty(array.shape[1], dtype=np.float32)
+        n_sum = cp.empty(int(n), dtype=np.float32)
+        out = cp.empty(nout, dtype=np.float32)
+
+        tpb = self.threadsPerBlock
+        mid_block, mid_shared = (tpb, 1, 1), tpb*4
+        last_block, last_shared = (32, 32, 1), 32*32*4
+
+        # sum all dims except the first, multiplying by the index and scaling factor
+        self.indexed_sum_middim_cuda(
+            (int(i), 1, 1), mid_block, (array, i_sum, np.int32(1), i, n*m, sc),
+            shared_mem=mid_shared)
+
+        if is_3d:
+            # sum all dims except the middle, multiplying by the index and scaling factor
+            self.indexed_sum_middim_cuda(
+                (int(m), 1, 1), mid_block, (array, m_sum, i, n, m, sc),
+                shared_mem=mid_shared)
+            # sum all dims except the last, multiplying by the index and scaling factor
+            self.indexed_sum_lastdim_cuda(
+                (1, int(n + 32 - 1) // 32, 1), last_block, (array, n_sum, i*m, n, sc),
+                shared_mem=last_shared)
+        else:
+            # 2d case: the second axis is the last one
+            self.indexed_sum_lastdim_cuda(
+                (1, int(m + 32 - 1) // 32, 1), last_block, (array, m_sum, i, m, sc),
+                shared_mem=last_shared)
+
+        self.final_sums_cuda(
+            (nout, 1, 1), (256, 1, 1), (i_sum, i, m_sum, m, n_sum, n, out),
+            shared_mem=256*4)
+
+        return out
+
+class Abs2SumKernel:
+    """Wrapper for the ``abs2sum`` kernel; complex input gives real output."""
+
+    def __init__(self, dtype, queue=None):
+        """Compile the kernel for input ``dtype``; ``queue`` is an optional stream."""
+        self.in_stype = map2ctype(dtype)
+        if self.in_stype == 'complex<float>':
+            self.out_stype = 'float'
+            self.out_dtype = np.float32
+        elif self.in_stype == 'complex<double>':
+            self.out_stype = 'double'
+            self.out_dtype = np.float64
+        else:
+            self.out_stype = self.in_stype
+            self.out_dtype = dtype
+
+        self.queue = queue
+        self.threadsPerBlock = 32
+        self.abs2sum_cuda = load_kernel("abs2sum", subs={
+            'IN_TYPE': self.in_stype,
+            'OUT_TYPE': self.out_stype,
+            'BDIM_X': self.threadsPerBlock,
+        })
+
+    def abs2sum(self, array):
+        """Run the kernel on a 3D ``array``; returns a new array of shape ``array.shape[1:]``."""
+        nmodes = np.int32(array.shape[0])
+        row, col = array.shape[1:]
+        out = cp.empty(array.shape[1:], dtype=self.out_dtype)
+
+        if self.queue is not None:
+            self.queue.use()
+        self.abs2sum_cuda((1, row, 1), (self.threadsPerBlock, 1, 1),
+                          (array, nmodes, np.int32(row), np.int32(col), out))
+
+        return out
+
+class InterpolatedShiftKernel:
+    """Shifts complex64 frames using the kernels of ``interpolated_shift.cu``."""
+
+    def __init__(self, queue=None):
+        """:param queue: optional stream; ``queue.use()`` precedes the launch."""
+        self.queue = queue
+        self.integer_shift_cuda, self.linear_interpolate_cuda = load_kernel(
+            ("integer_shift_kernel", "linear_interpolate_kernel"),
+            file="interpolated_shift.cu", subs={
+                'IN_TYPE': 'complex<float>',
+                'OUT_TYPE': 'complex<float>',
+                'BDIM_X': 32,
+                'BDIM_Y': 32,
+            })
+
+    def interpolate_shift(self, array, shift):
+        """
+        Return ``array`` (2D or 3D, complex64) shifted by ``shift`` = (rows, columns).
+
+        Shifts whose fractional parts are below 1e-6 use the integer shift kernel,
+        all others linear interpolation; for a zero shift ``array`` itself is returned.
+        """
+        shift = np.asarray(shift, dtype=np.float32)
+        if len(shift) != 2:
+            raise NotImplementedError("Shift only applied to 2D array.")
+        if array.dtype != np.complex64:
+            raise NotImplementedError("Only complex single precision supported")
+        if array.ndim == 3:
+            items, rows, columns = array.shape
+        elif array.ndim == 2:
+            items, rows, columns = 1, *array.shape
+        else:
+            raise NotImplementedError("Only 2- or 3-dimensional arrays supported")
+
+        offsetRow, offsetCol = shift
+        offsetRowFrac, offsetRowInt = np.modf(offsetRow)
+        offsetColFrac, offsetColInt = np.modf(offsetCol)
+        is_integer = np.abs(offsetRowFrac) < 1e-6 and np.abs(offsetColFrac) < 1e-6
+
+        if self.queue is not None:
+            self.queue.use()
+        if is_integer and offsetRowInt == 0 and offsetColInt == 0:
+            # no transformation at all, so no output buffer is needed either
+            return array
+        out = cp.empty_like(array)
+        block_ = (32, 32, 1)
+        grid_ = ((rows + 31) // 32, (columns + 31) // 32, items)
+        if is_integer:
+            # no fractional part, so we can just use a shifted copy
+            self.integer_shift_cuda(grid_, block_, (
+                array, out, np.int32(rows), np.int32(columns), np.int32(offsetRow), np.int32(offsetCol)))
+        else:
+            self.linear_interpolate_cuda(grid_, block_, (
+                array, out, np.int32(rows), np.int32(columns), np.float32(offsetRow), np.float32(offsetCol)),
+                shared_mem=(32+2)**2*8+32*(32+2)*8)
+        return out
+
diff --git a/ptypy/accelerate/cuda_cupy/cufft.py b/ptypy/accelerate/cuda_cupy/cufft.py
new file mode 100644
index 000000000..794efb858
--- /dev/null
+++ b/ptypy/accelerate/cuda_cupy/cufft.py
@@ -0,0 +1,171 @@
+import cupy as cp
+from cupyx.scipy import fft as cuxfft
+from cupyx.scipy.fft import get_fft_plan
+from . import load_kernel
+import numpy as np
+
+
+class FFT_cuda(object):
+    """Batched 2D FFT based on the compiled ``filtered_cufft`` extension module."""
+
+    def __init__(self, array, queue=None,
+                 inplace=False,
+                 pre_fft=None,
+                 post_fft=None,
+                 symmetric=True,
+                 forward=True):
+        """Square frames of power-of-2 size (16..2048) only; ``inplace`` is unused."""
+        self._queue = queue
+        dims = array.ndim
+        if dims < 2:
+            raise AssertionError('Input array must be at least 2-dimensional')
+        self.arr_shape = (array.shape[-2], array.shape[-1])
+        rows, columns = self.arr_shape
+        if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]:
+            raise ValueError(
+                "CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048")
+        # all leading axes are batches; np.prod of an empty shape gives 1
+        self.batches = int(np.prod(array.shape[:-2]))
+        self.forward = forward
+
+        self._load(array, pre_fft, post_fft, symmetric, forward)
+
+    def _stream_ptr(self):
+        """Stream handle for the extension; 0 (CUDA default stream) if no queue."""
+        return 0 if self._queue is None else self._queue.ptr
+
+    def _load(self, array, pre_fft, post_fft, symmetric, forward):
+        """Upload the optional filters and create the ``FilteredFFT`` object."""
+        if pre_fft is not None:
+            self.pre_fft = cp.asarray(pre_fft)
+            self.pre_fft_ptr = self.pre_fft.data.ptr
+        else:
+            self.pre_fft_ptr = 0
+        if post_fft is not None:
+            self.post_fft = cp.asarray(post_fft)
+            self.post_fft_ptr = self.post_fft.data.ptr
+        else:
+            self.post_fft_ptr = 0
+
+        import filtered_cufft
+        self.fftobj = filtered_cufft.FilteredFFT(
+            self.batches, self.arr_shape[0], self.arr_shape[1], symmetric, forward,
+            self.pre_fft_ptr, self.post_fft_ptr, self._stream_ptr())
+
+        self.ft = self._ft
+        self.ift = self._ift
+
+    @property
+    def queue(self):
+        return self._queue
+
+    @queue.setter
+    def queue(self, queue):
+        self._queue = queue
+        self.fftobj.queue = self._stream_ptr()
+
+    def _ft(self, input, output):
+        self.fftobj.fft(input.data.ptr, output.data.ptr)
+
+    def _ift(self, input, output):
+        self.fftobj.ifft(input.data.ptr, output.data.ptr)
+
+
+class FFT_cupy(FFT_cuda):
+    """FFT_cuda variant built on cupy's FFT and the ``batched_multiply`` kernel."""
+
+    @property
+    def queue(self):
+        return self._queue
+
+    @queue.setter
+    def queue(self, queue):
+        # unlike in the base class there is no fftobj to hand the stream to
+        self._queue = queue
+
+    @staticmethod
+    def _ctype(dtype):
+        """Return 'float' for complex64 and 'double' for anything else."""
+        return 'float' if dtype == np.complex64 else 'double'
+
+    def _load_multiply(self, array, filt, do_scale):
+        """
+        Compile ``batched_multiply`` for ``array``; MATH_TYPE follows the
+        dtype of ``filt`` if a filter is given, else that of ``array``.
+        """
+        data_type = self._ctype(array.dtype)
+        return load_kernel("batched_multiply", {
+            'MPY_DO_SCALE': 'true' if do_scale else 'false',
+            'MPY_DO_FILT': 'true' if filt is not None else 'false',
+            'IN_TYPE': data_type,
+            'OUT_TYPE': data_type,
+            'MATH_TYPE': data_type if filt is None else self._ctype(filt.dtype)
+        })
+
+    def _load(self, array, pre_fft, post_fft, symmetric, forward):
+        """Compile the filter kernels, create the FFT plan and upload filters."""
+        complex_types = [np.complex64, np.complex128]
+        assert (array.dtype in complex_types)
+        assert (pre_fft is None or pre_fft.dtype in complex_types)
+        assert (post_fft is None or post_fft.dtype in complex_types)
+
+        self.pre_fft_knl = None
+        if pre_fft is not None:
+            self.pre_fft_knl = self._load_multiply(array, pre_fft, False)
+
+        # scaling is compiled in unless this is a non-symmetric forward transform
+        do_scale = symmetric or not forward
+        self.post_fft_knl = None
+        if do_scale or post_fft is not None:
+            self.post_fft_knl = self._load_multiply(array, post_fft, do_scale)
+
+        self.block = (32, 32, 1)
+        self.grid = (int((self.arr_shape[0] + 31) // 32),
+                     int((self.arr_shape[1] + 31) // 32),
+                     int(self.batches))
+        if self.queue is not None:
+            self.queue.use()
+        self.plan = get_fft_plan(array, self.arr_shape, axes=(-2, -1), value_type="C2C")
+        self.scale = 1.0
+        self.norm = 'ortho' if symmetric else 'backward'
+
+        # a missing filter is handed to the kernel as a NULL pointer
+        self.pre_fft = np.intp(0) if pre_fft is None else cp.asarray(pre_fft)
+        self.post_fft = np.intp(0) if post_fft is None else cp.asarray(post_fft)
+
+        self.ft = self._ft
+        self.ift = self._ift
+
+    def _multiply(self, knl, x, y, filt):
+        """Launch ``knl`` with input ``x``, output ``y`` and filter ``filt``."""
+        knl(grid=self.grid, block=self.block,
+            args=(x, y, filt, np.float32(self.scale), np.int32(self.batches),
+                  np.int32(self.arr_shape[0]), np.int32(self.arr_shape[1])))
+
+    def _prefilt(self, x, y):
+        """Write the pre-filtered ``x`` to ``y`` (plain copy without filter)."""
+        if self.pre_fft_knl:
+            self._multiply(self.pre_fft_knl, x, y, self.pre_fft)
+        else:
+            y[:] = x[:]
+
+    def _postfilt(self, y):
+        """Run the post-FFT kernel on ``y`` in-place, if one was compiled."""
+        if self.post_fft_knl:
+            assert self.post_fft is not None
+            assert self.scale is not None
+            self._multiply(self.post_fft_knl, y, y, self.post_fft)
+
+    def _transform(self, fft_func, x, y):
+        """Pre-filter ``x`` into ``y``, apply ``fft_func`` to ``y``, post-filter."""
+        if self.queue is not None:
+            self.queue.use()
+        self._prefilt(x, y)
+        fft_func(y, axes=(-2, -1), plan=self.plan, overwrite_x=True, norm=self.norm)
+        self._postfilt(y)
+
+    def _ft(self, x, y):
+        self._transform(cuxfft.fft2, x, y)
+
+    def _ift(self, x, y):
+        self._transform(cuxfft.ifft2, x, y)
diff --git a/ptypy/accelerate/cuda_cupy/dependencies.yml b/ptypy/accelerate/cuda_cupy/dependencies.yml
new file mode 100644
index 000000000..cb7d31fce
--- /dev/null
+++ b/ptypy/accelerate/cuda_cupy/dependencies.yml
@@ -0,0 +1,17 @@
+name: ptypy_cupy
+channels:
+  - conda-forge
+dependencies:
+  - python=3.9
+  - numpy
+  - scipy
+  - matplotlib
+  - h5py
+  - pyzmq
+  - mpi4py
+  - pillow
+  - pyfftw
+  - cupy
+  - cudatoolkit-dev
+  - pip
+  - compilers
\ No newline at end of file
diff --git a/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py b/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
new file mode 100644
index 000000000..c3cb39c09
--- /dev/null
+++ b/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
@@ -0,0 +1,804 @@
+# -*- coding: utf-8 -*-
+"""
+Maximum Likelihood reconstruction engine.
+
+TODO.
+
+  * Implement other regularizers
+
+This file is part of the PTYPY package.
+
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: see LICENSE for details.
+"""
+import numpy as np
+import cupy as cp
+import cupyx
+
+from ptypy.engines import register
+from ptypy.accelerate.base.engines.ML_serial import ML_serial, BaseModelSerial
+from ptypy import utils as u
+from ptypy.utils.verbose import logger, log
+from ptypy.utils import parallel
+from .. import get_context, log_device_memory_stats
+from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
+from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
+from ..array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel, TransposeKernel
+from ..mem_utils import GpuDataManager
+
+#from ..mem_utils import GpuDataManager
+from ptypy.accelerate.base import address_manglers
+
+__all__ = ['ML_cupy']
+
+# can be used to limit the number of blocks, simulating that they don't fit
+MAX_BLOCKS = 99999
+# MAX_BLOCKS = 3  # can be used to limit the number of blocks, simulating that they don't fit
+
+
+@register()
+class ML_cupy(ML_serial):
+
+    """
+    Defaults:
+
+    [probe_update_cuda_atomics]
+    default = False
+    type = bool
+    help = For GPU, use the atomics version for probe update kernel
+
+    [object_update_cuda_atomics]
+    default = True
+    type = bool
+    help = For GPU, use the atomics version for object update kernel
+
+    [fft_lib]
+    default = cuda
+    type = str
+    help = Choose the cupy-compatible FFT module.
+    doc = One of:
+      - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU)
+      - ``'cupy'`` : cupy using cufft (fast load, slowest compute due to additional store/load stages)
+    choices = 'cuda','cupy'
+    userlevel = 2
+
+    """
+
+    def __init__(self, ptycho_parent, pars=None):
+        """
+        Create the engine; both arguments are passed on unchanged to ML_serial.
+        """
+        super().__init__(ptycho_parent, pars)
+
+    def engine_initialize(self):
+        """
+        Prepare for ML reconstruction: create queue, streams and helper kernels.
+        """
+        self.queue = get_context(new_queue=True)
+
+        # extra streams; by their names for host-to-device / device-to-host copies
+        self.qu_htod = cp.cuda.Stream()
+        self.qu_dtoh = cp.cuda.Stream()
+
+        self.GSK = GaussianSmoothingKernel(queue=self.queue)
+        self.GSK.tmp = None
+
+        # Real/Fourier Support Kernel
+        self.RSK = {}
+        self.FSK = {}
+
+        super().engine_initialize()
+
+    def _setup_kernels(self):
+        """
+        Setup kernels, one for each scan (scans are derived from the ptycho class),
+        then size the GPU data managers according to the free device memory.
+
+        :raises SystemExit: if not even a single block fits into device memory.
+        """
+        AUK = ArrayUtilsKernel(queue=self.queue)
+        self._dot_kernel = AUK.dot
+        # get the scans
+        for label, scan in self.ptycho.model.scans.items():
+            kern = u.Param()
+            kern.scanmodel = type(scan).__name__
+            self.kernels[label] = kern
+
+            # TODO: needs to be adapted for broad bandwidth
+            geo = scan.geometries[0]
+            # Get info to shape buffer arrays
+            fpc = scan.max_frames_per_block
+
+            # TODO : make this more foolproof
+            try:
+                nmodes = scan.p.coherence.num_probe_modes * \
+                    scan.p.coherence.num_object_modes
+            except Exception:
+                # best-effort fallback, but let KeyboardInterrupt/SystemExit through
+                nmodes = 1
+
+            # create buffer arrays
+            ash = (fpc * nmodes,) + tuple([int(s) for s in geo.shape])
+            aux = cp.zeros(ash, dtype=np.complex64)
+            kern.aux = aux
+            kern.a = cp.zeros(ash, dtype=np.complex64)
+            kern.b = cp.zeros(ash, dtype=np.complex64)
+
+            # setup kernels, one for each SCAN.
+            kern.GDK = GradientDescentKernel(
+                aux, nmodes, queue=self.queue, math_type="double")
+            kern.GDK.allocate()
+            kern.POK = PoUpdateKernel(queue_thread=self.queue)
+            kern.POK.allocate()
+            kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue)
+            kern.AWK.allocate()
+            kern.TK = TransposeKernel(queue=self.queue)
+
+            kern.PROP = PropagationKernel(
+                aux, geo.propagator, queue_thread=self.queue, fft_type=self.p.fft_lib)
+            kern.PROP.allocate()
+            kern.resolution = geo.resolution[0]
+
+            if self.do_position_refinement:
+                kern.PCK = PositionCorrectionKernel(
+                    aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue)
+                kern.PCK.allocate()
+
+        mag_mem = 0
+        for kern in self.kernels.values():
+            mag_mem = max(kern.aux.nbytes // 2, mag_mem)
+        ma_mem = mag_mem
+        blk = ma_mem + mag_mem
+
+        # We need to add the free memory from the pool to the free device memory,
+        # as both will be used for allocations
+        mempool = cp.get_default_memory_pool()
+        mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes()
+
+        # leave 200MB room for safety
+        fit = int(mem - 200 * 1024 * 1024) // blk
+        if fit <= 0:
+            # also catches the negative count when less than 200MB are free
+            log(1, "Cannot fit memory into device, if possible reduce frames per block. Exiting...")
+            raise SystemExit("ptypy has been exited.")
+
+        # TODO grow blocks dynamically
+        nma = min(fit, MAX_BLOCKS)
+        log_device_memory_stats(4)
+        log(4, 'CuPy max blocks fitting on GPU: ma_arrays={}'.format(nma))
+        # reset memory or create new
+        self.w_data = GpuDataManager(ma_mem, 0, nma, False)
+        self.I_data = GpuDataManager(mag_mem, 0, nma, False)
+
+    def engine_prepare(self):
+        """Mirror probe/object storages on the GPU and stage all new data blocks."""
+        super().engine_prepare()
+        # addr2 (a transposed address copy) is only built if an update kernel runs without atomics
+        use_tiles = (not self.p.probe_update_cuda_atomics) or (
+            not self.p.object_update_cuda_atomics)
+
+        # recursive copy to gpu for probe and object
+        for _cname, c in self.ptycho.containers.items():
+            if c.original != self.pr and c.original != self.ob:
+                continue
+            for _sname, s in c.S.items():
+                # device copy plus a pinned (page-locked) host copy of the storage data
+                s.gpu = cp.asarray(s.data)
+                s.cpu = cupyx.empty_pinned(
+                    s.data.shape, s.data.dtype, order="C")
+                s.cpu[:] = s.data
+
+        for label, d in self.ptycho.new_data:
+            prep = self.diff_info[d.ID]
+            prep.err_phot_gpu = cp.asarray(prep.err_phot)
+            prep.fic_gpu = cp.ones_like(prep.err_phot_gpu)
+
+            if use_tiles:
+                prep.addr2 = np.ascontiguousarray(
+                    np.transpose(prep.addr, (2, 3, 0, 1)))
+
+            prep.addr_gpu = cp.asarray(prep.addr)
+            if self.do_position_refinement:
+                prep.original_addr_gpu = cp.asarray(prep.original_addr)
+                prep.error_state_gpu = cp.empty_like(prep.err_phot_gpu)
+                prep.mangled_addr_gpu = prep.addr_gpu.copy()
+
+            # TODO: Which address to pick?
+            if use_tiles:
+                prep.addr2_gpu = cp.asarray(prep.addr2)
+
+            prep.I = cupyx.empty_pinned(d.data.shape, d.data.dtype, order="C")
+            prep.I[:] = d.data
+
+            # TODO: avoid that extra copy of data
+            if self.do_position_refinement:
+                ma = self.ma.S[d.ID].data.astype(np.float32)
+                prep.ma = cupyx.empty_pinned(ma.shape, ma.dtype, order="C")
+                prep.ma[:] = ma
+
+            log(4, 'Free memory on device: %.2f GB' %
+                (float(cp.cuda.runtime.memGetInfo()[0])/1e9))
+            self.w_data.add_data_block()
+            self.I_data.add_data_block()
+
+        self.dID_list = list(self.di.S.keys())
+
+    def _initialize_model(self):
+        """Instantiate the noise model named by ``self.p.ML_type`` (case-insensitive)."""
+        ml_type = self.p.ML_type.lower()
+        if ml_type == "gaussian":
+            self.ML_model = GaussianModel(self)
+            return
+        if ml_type == "poisson":
+            raise NotImplementedError('Poisson norm model not yet implemented')
+        if ml_type == "euclid":
+            raise NotImplementedError('Euclid norm model not yet implemented')
+        raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type)
+
+    def _set_pr_ob_ref_for_data(self, dev='gpu', container=None, sync_copy=False):
+        """
+        Rebind Storage.data of probe/object-derived containers to the 'gpu' or 'cpu' buffer for in-place math.
+        """
+        if container is None:
+            # no container given: visit every container known to the ptycho instance
+            for cont in self.ptycho.containers.values():
+                self._set_pr_ob_ref_for_data(
+                    dev=dev, container=cont, sync_copy=sync_copy)
+            return
+        if not (container.original == self.pr or container.original == self.ob):
+            return
+        for stor in container.S.values():
+            if dev == 'gpu':
+                stor.data = stor.gpu
+                if sync_copy:
+                    stor.gpu.set(stor.cpu)
+            elif dev == 'cpu':
+                stor.data = stor.cpu
+                if sync_copy:
+                    stor.gpu.get(out=stor.cpu)
+
+    def _get_smooth_gradient(self, data, sigma):
+        if self.GSK.tmp is None:  # scratch buffer is created on first use and kept on the kernel
+            self.GSK.tmp = cp.empty(data.shape, dtype=np.complex64)
+        self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp)  # same sigma for both entries
+        return data  # the array object that was passed in is handed back
+
+    def _replace_ob_grad(self):
+        """Optionally smooth the new object gradient, then pass it to ``_replace_grad``."""
+        fresh = self.ob_grad_new
+        smoother = self.smooth_gradient
+        if smoother:
+            # Smoothing preconditioner: sigma shrinks by the decay factor on every call
+            smoother.sigma *= (1. - self.p.smooth_gradient_decay)
+            for stor in fresh.storages.values():
+                stor.gpu = self._get_smooth_gradient(stor.gpu, smoother.sigma)
+        return self._replace_grad(self.ob_grad, fresh)
+
+    def _replace_pr_grad(self):
+        """Constrain (or zero) the new probe gradient, then pass it to ``_replace_grad``."""
+        fresh = self.pr_grad_new
+        if self.curiter < self.p.probe_update_start:
+            # probe updates have not started yet: discard the gradient
+            fresh.fill(0.)
+        else:
+            # Apply probe support if needed
+            for stor in fresh.storages.values():
+                self.support_constraint(stor)
+        return self._replace_grad(self.pr_grad, fresh)
+
+    def _replace_grad(self, grad, new_grad):
+        """Return the sums of dot(new, new) and dot(new, old) over storages; copy new into old."""
+        sq_norm, overlap = np.double(0.), np.double(0.)
+        for key, fresh in new_grad.storages.items():
+            prev = grad.storages[key]
+            sq_norm += self._dot_kernel(fresh.gpu, fresh.gpu).get()[0]
+            overlap += self._dot_kernel(fresh.gpu, prev.gpu).get()[0]
+            prev.gpu[:] = fresh.gpu  # done last: the dot product above reads the old values
+        return sq_norm, overlap
+
+    def engine_iterate(self, num=1):
+        """Run ``num`` iterations of the parent engine, then copy probe/object back to the host."""
+        errors = super().engine_iterate(num)
+        self._set_pr_ob_ref_for_data(dev='cpu', container=None, sync_copy=True)
+        return errors
+
+    def position_update(self):
+        """
+        Position refinement.
+
+        For each data block the current error is re-evaluated, every shift
+        offered by the kernel's mangler is tried, and the address / error
+        state is updated through ``update_addr_and_error_state``.
+        """
+        if not self.do_position_refinement or (not self.curiter):
+            return
+        do_update_pos = (self.p.position_refinement.stop >
+                         self.curiter >= self.p.position_refinement.start)
+        do_update_pos &= (self.curiter %
+                          self.p.position_refinement.interval) == 0
+        if not do_update_pos:
+            return
+        use_tiles = (not self.p.probe_update_cuda_atomics) or (
+            not self.p.object_update_cuda_atomics)
+
+        # Iterates through all positions and refines them by a given algorithm.
+        log(4, "----------- START POS REF -------------")
+        for dID in self.dID_list:
+            prep = self.diff_info[dID]
+            pID, oID, _eID = prep.poe_IDs
+            ob = self.ob.S[oID].gpu
+            pr = self.pr.S[pID].gpu
+            kern = self.kernels[prep.label]
+            aux = kern.aux
+            addr = prep.addr_gpu
+            mangled_addr = prep.mangled_addr_gpu
+            err_phot = prep.err_phot_gpu
+            error_state = prep.error_state_gpu
+
+            # copy intensities and weights to GPU
+            _ev_w, w, data_w = self.w_data.to_gpu(
+                prep.weights, dID, self.qu_htod)
+            ev, I, data_I = self.I_data.to_gpu(prep.I, dID, self.qu_htod)
+
+            PCK = kern.PCK
+            TK = kern.TK
+            PROP = kern.PROP
+
+            # Keep track of object boundaries
+            max_oby = ob.shape[-2] - aux.shape[-2] - 1
+            max_obx = ob.shape[-1] - aux.shape[-1] - 1
+
+            # We need to re-calculate the current error
+            PCK.build_aux(aux, addr, ob, pr)
+            PROP.fw(aux, aux)
+            PCK.queue.wait_event(ev)
+            # w & I now on device
+            PCK.log_likelihood_ml(aux, addr, I, w, err_phot)
+            cp.cuda.runtime.memcpy(dst=error_state.data.ptr,
+                                   src=err_phot.data.ptr,
+                                   size=err_phot.nbytes,
+                                   kind=3)  # d2d
+
+            PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0])
+
+            log(4, 'Position refinement trial: iteration %s' % (self.curiter))
+            for i in range(PCK.mangler.nshifts):
+                PCK.mangler.get_address(
+                    i, addr, mangled_addr, max_oby, max_obx)
+                PCK.build_aux(aux, mangled_addr, ob, pr)
+                PROP.fw(aux, aux)
+                PCK.log_likelihood_ml(aux, mangled_addr, I, w, err_phot)
+                PCK.update_addr_and_error_state(
+                    addr, error_state, mangled_addr, err_phot)
+
+            data_w.record_done(self.queue, 'compute')
+            data_I.record_done(self.queue, 'compute')
+            cp.cuda.runtime.memcpy(dst=err_phot.data.ptr,
+                                   src=error_state.data.ptr,
+                                   size=err_phot.nbytes,
+                                   kind=3)  # d2d
+            if use_tiles:
+                s1 = addr.shape[0] * addr.shape[1]
+                s2 = addr.shape[2] * addr.shape[3]
+                TK.transpose(addr.reshape(s1, s2),
+                             prep.addr2_gpu.reshape(s2, s1))
+
+        self.dID_list.reverse()
+
+    def support_constraint(self, storage=None):
+        """Enforce the 2D support constraint on one probe storage, or on all of them if ``storage`` is None."""
+        if storage is None:
+            for s in self.pr.storages.values():
+                self.support_constraint(s)
+            # every storage was handled by the recursive calls; without this return
+            # the code below would dereference ``storage.ID`` on None
+            return
+
+        # Fourier space
+        support = self._probe_fourier_support.get(storage.ID)
+        if support is not None:
+            if storage.ID not in self.FSK:
+                self.FSK[storage.ID] = FourierSupportKernel(
+                    support.astype(np.complex64), self.queue, self.p.fft_lib)
+                self.FSK[storage.ID].allocate()
+            self.FSK[storage.ID].apply_fourier_support(storage.gpu)
+
+        # Real space
+        support = self._probe_support.get(storage.ID)
+        if support is not None:
+            if storage.ID not in self.RSK:
+                self.RSK[storage.ID] = RealSupportKernel(
+                    support.astype(np.complex64))
+                self.RSK[storage.ID].allocate()
+            self.RSK[storage.ID].apply_real_support(storage.gpu)
+
+    def engine_finalize(self):
+        """
+        Clear all GPU data, pinned memory, etc
+
+        Probe/object data, addresses and intensity coefficients are fetched to the host first.
+        """
+        self.w_data = None
+        self.I_data = None
+
+        for cont in (self.pr, self.ob):
+            for stor in cont.S.values():
+                # need this, otherwise getting segfault once context is detached
+                stor.data = stor.gpu.get()
+                # no longer need those
+                del stor.gpu
+                del stor.cpu
+        # bring addresses and floating intensity coefficients back as host arrays
+        for prep in self.diff_info.values():
+            prep.addr = prep.addr_gpu.get()
+            prep.float_intens_coeff = prep.fic_gpu.get()
+
+        # self.queue.synchronize()
+        super().engine_finalize()
+
+        log_device_memory_stats(4)
+
+
+class GaussianModel(BaseModelSerial):
+    """
+    Gaussian noise model.
+    TODO: feed actual statistical weights instead of using the Poisson statistic heuristic.
+    """
+
+    def __init__(self, MLengine):
+        """
+        Core functions for ML computation using a Gaussian model.
+        """
+        super(GaussianModel, self).__init__(MLengine)
+
+        if self.p.reg_del2:
+            self.regularizer = Regul_del2_cupy(
+                self.p.reg_del2_amplitude,
+                queue=self.engine.queue
+            )
+        else:
+            self.regularizer = None
+
+    def prepare(self):
+        """Compute the weights of every new data block and keep them in pinned host memory."""
+        super(GaussianModel, self).prepare()
+
+        for label, d in self.engine.ptycho.new_data:
+            prep = self.engine.diff_info[d.ID]
+            w = (self.Irenorm * self.engine.ma.S[d.ID].data
+                 / (1. / self.Irenorm + d.data)).astype(d.data.dtype)
+            prep.weights = cupyx.empty_pinned(w.shape, w.dtype, order="C")
+            prep.weights[:] = w
+
+    def __del__(self):
+        """
+        Clean up routine (delegates to the base class)
+        """
+        super(GaussianModel, self).__del__()
+
+    def new_grad(self):
+        """
+        Compute a new gradient direction according to a Gaussian noise model.
+
+        Also computes the negative log-likelihood (stored in ``self.LL``) and
+        returns the local errors as a dict keyed by view ID.
+        """
+        ob_grad = self.engine.ob_grad_new
+        pr_grad = self.engine.pr_grad_new
+        qu_htod = self.engine.qu_htod
+        queue = self.engine.queue
+
+        self.engine._set_pr_ob_ref_for_data('gpu')
+        ob_grad << 0.
+        pr_grad << 0.
+
+        # We need an array for MPI
+        LL = np.array([0.])
+        error_dct = {}
+
+        for dID in self.engine.dID_list:
+            prep = self.engine.diff_info[dID]
+            # probe, object and exit IDs associated with this dID
+            pID, oID, eID = prep.poe_IDs
+
+            # references for kernels
+            kern = self.engine.kernels[prep.label]
+            GDK = kern.GDK
+            AWK = kern.AWK
+            POK = kern.POK
+            aux = kern.aux
+
+            FW = kern.PROP.fw
+            BW = kern.PROP.bw
+
+            # get addresses and auxiliary arrays
+            addr = prep.addr_gpu
+            fic = prep.fic_gpu
+
+            err_phot = prep.err_phot_gpu
+            # local references
+            ob = self.engine.ob.S[oID].data
+            obg = ob_grad.S[oID].data
+            pr = self.engine.pr.S[pID].data
+            prg = pr_grad.S[pID].data
+
+            # Schedule w & I to device
+            ev_w, w, data_w = self.engine.w_data.to_gpu(
+                prep.weights, dID, qu_htod)
+            ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod)
+
+            # make propagated exit (to buffer)
+            AWK.build_aux_no_ex(aux, addr, ob, pr, add=False)
+
+            # forward prop
+            FW(aux, aux)
+            GDK.make_model(aux, addr)
+
+            queue.wait_event(ev)  # wait on the event returned for the I upload before using w and I
+
+            if self.p.floating_intensities:
+                GDK.floating_intensity(addr, w, I, fic)
+
+            GDK.main(aux, addr, w, I)
+            data_w.record_done(queue, 'compute')
+            data_I.record_done(queue, 'compute')
+
+            GDK.error_reduce(addr, err_phot)
+
+            BW(aux, aux)
+
+            use_atomics = self.p.object_update_cuda_atomics  # addr2_gpu is used when atomics are off
+            addr = prep.addr_gpu if use_atomics else prep.addr2_gpu
+            POK.ob_update_ML(addr, obg, pr, aux, atomics=use_atomics)
+
+            use_atomics = self.p.probe_update_cuda_atomics  # same address choice for the probe update
+            addr = prep.addr_gpu if use_atomics else prep.addr2_gpu
+            POK.pr_update_ML(addr, prg, ob, aux, atomics=use_atomics)
+
+        queue.synchronize()
+        self.engine.dID_list.reverse()
+
+        # TODO: err_phot.sum is needed here, but error_dct is not necessarily needed until the end of contiguous iteration
+        for dID, prep in self.engine.diff_info.items():
+            err_phot = prep.err_phot_gpu.get()
+            LL += err_phot.sum()
+            err_phot /= np.prod(prep.weights.shape[-2:])
+            err_fourier = np.zeros_like(err_phot)
+            err_exit = np.zeros_like(err_phot)
+            errs = np.ascontiguousarray(
+                np.vstack([err_fourier, err_phot, err_exit]).T)
+            error_dct.update(zip(prep.view_IDs, errs))
+
+        # MPI reduction of gradients
+
+        # DtoH copies
+        for s in ob_grad.S.values():
+            s.gpu.get(out=s.cpu)
+        for s in pr_grad.S.values():
+            s.gpu.get(out=s.cpu)
+        self.engine._set_pr_ob_ref_for_data('cpu')
+
+        ob_grad.allreduce()
+        pr_grad.allreduce()
+        parallel.allreduce(LL)
+
+        # HtoD because we continue on the GPU
+        for s in ob_grad.S.values():
+            s.gpu.set(s.cpu)
+        for s in pr_grad.S.values():
+            s.gpu.set(s.cpu)
+        self.engine._set_pr_ob_ref_for_data('gpu')
+
+        # Object regularizer
+        if self.regularizer:
+            for name, s in self.engine.ob.storages.items():
+                ob_grad.storages[name].data += self.regularizer.grad(s.data)
+                LL += self.regularizer.LL
+
+        self.LL = LL / self.tot_measpts
+
+        return error_dct
+
+    def poly_line_coeffs(self, c_ob_h, c_pr_h):
+        """
+        Compute the coefficients of the polynomial for line minimization
+        in direction h; the 3 coefficients are stored in ``self.B`` and returned.
+        """
+        self.engine._set_pr_ob_ref_for_data('gpu')
+        qu_htod = self.engine.qu_htod
+        queue = self.engine.queue
+
+        # does not accept np.longdouble
+        B = cp.zeros((3,), dtype=np.float32)
+        Brenorm = 1. / self.LL[0] ** 2
+
+        # Outer loop: through diffraction patterns
+        for dID in self.engine.dID_list:
+            prep = self.engine.diff_info[dID]
+
+            # probe, object and exit IDs associated with this dID
+            pID, oID, eID = prep.poe_IDs
+
+            # references for kernels
+            kern = self.engine.kernels[prep.label]
+            GDK = kern.GDK
+            AWK = kern.AWK
+
+            f = kern.aux
+            a = kern.a
+            b = kern.b
+
+            FW = kern.PROP.fw
+
+            # get addresses and auxiliary arrays
+            addr = prep.addr_gpu
+            fic = prep.fic_gpu
+
+            # Schedule w & I to device
+            ev_w, w, data_w = self.engine.w_data.to_gpu(
+                prep.weights, dID, qu_htod)
+            ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod)
+
+            # local references
+            ob = self.ob.S[oID].data
+            ob_h = c_ob_h.S[oID].data
+            pr = self.pr.S[pID].data
+            pr_h = c_pr_h.S[pID].data
+
+            # make propagated exit (to buffer)
+            AWK.build_aux_no_ex(f, addr, ob, pr, add=False)
+            AWK.build_aux_no_ex(a, addr, ob_h, pr, add=False)
+            AWK.build_aux_no_ex(a, addr, ob, pr_h, add=True)
+            AWK.build_aux_no_ex(b, addr, ob_h, pr_h, add=False)
+
+            # forward prop
+            FW(f, f)
+            FW(a, a)
+            FW(b, b)
+
+            queue.wait_event(ev)  # wait on the event returned for the I upload before using w and I
+
+            GDK.make_a012(f, a, b, addr, I, fic)
+            GDK.fill_b(addr, Brenorm, w, B)
+
+            data_w.record_done(queue, 'compute')
+            data_I.record_done(queue, 'compute')
+
+        queue.synchronize()
+        self.engine.dID_list.reverse()
+
+        B = B.get()
+        parallel.allreduce(B)
+
+        # Object regularizer
+        if self.regularizer:
+            for name, s in self.ob.storages.items():
+                B += Brenorm * self.regularizer.poly_line_coeffs(
+                    c_ob_h.storages[name].data, s.data)
+
+        self.B = B
+
+        return B
+
+
+class Regul_del2_cupy(object):
+    """\
+    Squared gradient regularizer (Gaussian prior).
+
+    The derivative helpers accept arrays of dtype float32 or complex64; outputs are allocated with cupy.
+    """
+
+    def __init__(self, amplitude, axes=None, queue=None):
+        # None selects the last two axes; a fresh list per instance avoids a shared mutable default
+        self.axes = [-2, -1] if axes is None else axes
+        self.amplitude = amplitude
+        self.delxy = None
+        self.g = None
+        self.LL = None
+        self.queue = queue
+        self.AUK = ArrayUtilsKernel(queue=queue)
+        self.DELK_c = DerivativesKernel(np.complex64, queue=queue)
+        self.DELK_f = DerivativesKernel(np.float32, queue=queue)
+        self.coeff = None  # set by poly_line_coeffs()
+
+        def empty(x): return cp.empty(
+            x.shape, x.dtype)
+
+        def delxb(x, axis=-1):
+            out = empty(x)
+            if x.dtype == np.float32:
+                self.DELK_f.delxb(x, out, axis)
+            elif x.dtype == np.complex64:
+                self.DELK_c.delxb(x, out, axis)
+            else:
+                raise TypeError("Type %s invalid for derivatives" % x.dtype)
+            return out
+
+        self.delxb = delxb
+
+        def delxf(x, axis=-1):
+            out = empty(x)
+            if x.dtype == np.float32:
+                self.DELK_f.delxf(x, out, axis)
+            elif x.dtype == np.complex64:
+                self.DELK_c.delxf(x, out, axis)
+            else:
+                raise TypeError("Type %s invalid for derivatives" % x.dtype)
+            return out
+
+        self.delxf = delxf
+        self.norm = lambda x: self.AUK.norm2(x).get().item()
+        self.dot = lambda x, y: self.AUK.dot(x, y).get().item()
+
+        self._grad_reg_kernel = cp.ElementwiseKernel(
+            "float32 fac, complex64 py, complex64 px, complex64 my, complex64 mx", 
+            "complex64 out",
+            "out = (px+py-my-mx) * fac",
+            "grad_reg",
+            no_return=True
+        )
+
+        def grad(amp, px, py, mx, my):
+            out = empty(px)
+            if self.queue is not None:
+                self.queue.use()
+            self._grad_reg_kernel(amp, py, px, my, mx, out)
+            return out
+        self.reg_grad = grad
+
+    def grad(self, x):
+        """
+        Compute and return the regularizer gradient given the array x.
+        """
+        ax0, ax1 = self.axes
+        del_xf = self.delxf(x, axis=ax0)
+        del_yf = self.delxf(x, axis=ax1)
+        del_xb = self.delxb(x, axis=ax0)
+        del_yb = self.delxb(x, axis=ax1)
+
+        self.delxy = [del_xf, del_yf, del_xb, del_yb]
+
+        # elementwise kernel evaluating
+        # (del_xb + del_yb - del_xf - del_yf) * 2. * self.amplitude
+        self.g = self.reg_grad(2. * self.amplitude,
+                               del_xb, del_yb, del_xf, del_yf)
+
+        self.LL = self.amplitude * (self.norm(del_xf)
+                                    + self.norm(del_yf)
+                                    + self.norm(del_xb)
+                                    + self.norm(del_yb))
+
+        return self.g
+
+    def poly_line_coeffs(self, h, x=None):
+        """Return array [c0, c1, c2] for direction h; derivatives of x are reused from grad() if x is None."""
+        ax0, ax1 = self.axes
+        if x is None:
+            del_xf, del_yf, del_xb, del_yb = self.delxy
+        else:
+            del_xf = self.delxf(x, axis=ax0)
+            del_yf = self.delxf(x, axis=ax1)
+            del_xb = self.delxb(x, axis=ax0)
+            del_yb = self.delxb(x, axis=ax1)
+
+        hdel_xf = self.delxf(h, axis=ax0)
+        hdel_yf = self.delxf(h, axis=ax1)
+        hdel_xb = self.delxb(h, axis=ax0)
+        hdel_yb = self.delxb(h, axis=ax1)
+
+        c0 = self.amplitude * (self.norm(del_xf)
+                               + self.norm(del_yf)
+                               + self.norm(del_xb)
+                               + self.norm(del_yb))
+
+        c1 = 2 * self.amplitude * (self.dot(del_xf, hdel_xf)
+                                   + self.dot(del_yf, hdel_yf)
+                                   + self.dot(del_xb, hdel_xb)
+                                   + self.dot(del_yb, hdel_yb))
+
+        c2 = self.amplitude * (self.norm(hdel_xf)
+                               + self.norm(hdel_yf)
+                               + self.norm(hdel_xb)
+                               + self.norm(hdel_yb))
+        self.coeff = np.array([c0, c1, c2])
+        return self.coeff
diff --git a/ptypy/accelerate/cuda_cupy/engines/__init__.py b/ptypy/accelerate/cuda_cupy/engines/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py
new file mode 100644
index 000000000..f0c6ba40a
--- /dev/null
+++ b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py
@@ -0,0 +1,636 @@
+# -*- coding: utf-8 -*-
+"""
+Difference Map reconstruction engine.
+
+This file is part of the PTYPY package.
+
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: see LICENSE for details.
+"""
+
+import numpy as np
+import time
+import cupy as cp
+
+from ptypy import utils as u
+from ptypy.accelerate.cuda_cupy import get_context, log_device_memory_stats
+from ptypy.utils.verbose import logger, log
+from ptypy.utils import parallel
+from ptypy.engines import register
+from ptypy.engines.projectional import DMMixin, RAARMixin
+from ptypy.accelerate.base.engines import projectional_serial
+from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
+from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
+from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel,\
+    TransposeKernel, ClipMagnitudesKernel, MassCenterKernel, Abs2SumKernel,\
+    InterpolatedShiftKernel
+from ..mem_utils import make_pagelocked_paired_arrays as mppa
+from ..multi_gpu import get_multi_gpu_communicator
+
+__all__ = ['DM_cupy', 'RAAR_cupy']
+
+
+class _ProjectionEngine_cupy(projectional_serial._ProjectionEngine_serial):
+
+    """
+    Defaults:
+
+    [probe_update_cuda_atomics]
+    default = False
+    type = bool
+    help = For GPU, use the atomics version for probe update kernel
+
+    [object_update_cuda_atomics]
+    default = True
+    type = bool
+    help = For GPU, use the atomics version for object update kernel
+
+    [fft_lib]
+    default = cuda
+    type = str
+    help = Choose the pycuda-compatible FFT module.
+    doc = One of:
+      - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU)
+      - ``'cupy'`` : cupy's cuFFT wrapper (fast load, slowest compute due to additional store/load stages)
+    choices = 'cuda','cupy'
+    userlevel = 2
+
+    """
+
+    def __init__(self, ptycho_parent, pars=None):
+        """
+        Set up the engine through the serial base class; both arguments are forwarded unchanged.
+        """
+        super().__init__(ptycho_parent, pars)
+        self.multigpu = None  # assigned in engine_initialize()
+
+    def engine_initialize(self):
+        """
+        Fetch the GPU queue and communicator and build the helper kernels.
+        """
+        # Context, Multi GPU communicator and Stream (needs to be in this order)
+        queue = self.queue = get_context(new_queue=False)
+        self.multigpu = get_multi_gpu_communicator()
+
+        # Gaussian Smoothing Kernel
+        self.GSK = GaussianSmoothingKernel(queue=queue)
+
+        # Real/Fourier Support Kernel
+        self.RSK, self.FSK = {}, {}
+
+        # Clip Magnitudes Kernel
+        self.CMK = ClipMagnitudesKernel(queue=queue)
+
+        # kernels for centring the probe are only built when a tolerance is set
+        if self.p.probe_center_tol is not None:
+            # mass center kernel
+            self.MCK = MassCenterKernel(queue=queue)
+            # absolute sum kernel
+            self.A2SK = Abs2SumKernel(dtype=self.pr.dtype, queue=queue)
+            # interpolated shift kernel
+            self.ISK = InterpolatedShiftKernel(queue=queue)
+
+        # base-class initialisation runs last, after queue and kernels exist
+        super().engine_initialize()
+
+    def _setup_kernels(self):
+        """
+        Setup kernels, one for each scan. Derive scans from ptycho class
+        """
+        # get the scans
+        for label, scan in self.ptycho.model.scans.items():
+
+            kern = u.Param()
+            kern.scanmodel = type(scan).__name__
+            self.kernels[label] = kern
+            # TODO: needs to be adapted for broad bandwidth
+            geo = scan.geometries[0]
+
+            # Get info to shape buffer arrays
+            fpc = scan.max_frames_per_block
+
+            # TODO : make this more foolproof
+            try:
+                nmodes = scan.p.coherence.num_probe_modes * \
+                    scan.p.coherence.num_object_modes
+            except Exception:  # mode counts unavailable: fall back to one mode, but let Ctrl-C through
+                nmodes = 1
+
+            # create buffer arrays
+            ash = (fpc * nmodes,) + tuple(geo.shape)
+            aux = np.zeros(ash, dtype=np.complex64)
+            kern.aux = cp.asarray(aux)
+
+            # setup kernels, one for each SCAN.
+            log(4, "Setting up FourierUpdateKernel")
+            kern.FUK = FourierUpdateKernel(
+                aux, nmodes, queue_thread=self.queue)
+            kern.FUK.allocate()
+
+            log(4, "Setting up PoUpdateKernel")
+            kern.POK = PoUpdateKernel(queue_thread=self.queue)
+            kern.POK.allocate()
+
+            log(4, "Setting up AuxiliaryWaveKernel")
+            kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue)
+            kern.AWK.allocate()
+
+            log(4, "Setting up ArrayUtilsKernel")
+            kern.AUK = ArrayUtilsKernel(queue=self.queue)
+
+            log(4, "Setting up TransposeKernel")
+            kern.TK = TransposeKernel(queue=self.queue)
+
+            log(4, "Setting up PropagationKernel")
+            kern.PROP = PropagationKernel(
+                aux, geo.propagator, self.queue, self.p.fft_lib)
+            kern.PROP.allocate()
+            kern.resolution = geo.resolution[0]
+
+            if self.do_position_refinement:
+                log(4, "Setting up PositionCorrectionKernel")
+                kern.PCK = PositionCorrectionKernel(
+                    aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue)
+                kern.PCK.allocate()
+            log(4, "Kernel setup completed")
+
+    def engine_prepare(self):
+        """
+        Create device copies of object, probe, their buffers, the addresses and per-block data.
+        """
+        super().engine_prepare()
+
+        for stor in self.ob.S.values():
+            # TODO: investigate if this can be pinned, it's much faster
+            stor.gpu = cp.asarray(stor.data)
+        # the remaining containers get paired device / page-locked host arrays
+        paired = (self.ob_buf, self.ob_nrm, self.pr, self.pr_buf, self.pr_nrm)
+        for cont in paired:
+            for stor in cont.S.values():
+                stor.gpu, stor.data = mppa(stor.data)
+
+        # tiled addresses are needed unless both update kernels use atomics
+        use_tiles = not (self.p.probe_update_cuda_atomics
+                         and self.p.object_update_cuda_atomics)
+
+        # TODO : like the serialization this one is needed due to object reformatting
+        for stor in self.di.storages.values():
+            prep = self.diff_info[stor.ID]
+            prep.addr_gpu = cp.asarray(prep.addr)
+            if use_tiles:
+                prep.addr2 = np.ascontiguousarray(
+                    np.transpose(prep.addr, (2, 3, 0, 1)))
+                prep.addr2_gpu = cp.asarray(prep.addr2)
+            if self.do_position_refinement:
+                prep.mangled_addr_gpu = prep.addr_gpu.copy()
+
+        # uploads for newly added data blocks only
+        for _label, block in self.ptycho.new_data:
+            prep = self.diff_info[block.ID]
+            _pID, _oID, eID = prep.poe_IDs
+            exit_stor = self.ex.S[eID]
+            exit_stor.gpu = cp.asarray(exit_stor.data)
+            mask_stor = self.ma.S[block.ID]
+            mask_stor.gpu = cp.asarray(mask_stor.data.astype(np.float32))
+
+            prep.mag = cp.asarray(prep.mag)
+            prep.ma_sum = cp.asarray(prep.ma_sum)
+            prep.err_fourier_gpu = cp.asarray(prep.err_fourier)
+            prep.err_phot_gpu = cp.asarray(prep.err_phot)
+            prep.err_exit_gpu = cp.asarray(prep.err_exit)
+            if self.do_position_refinement:
+                prep.error_state_gpu = cp.empty_like(prep.err_fourier_gpu)
+
+    def engine_iterate(self, num=1):
+        """
+        Compute ``num`` iterations and return the per-view error dict (also stored in ``self.error``).
+        """
+        queue = self.queue
+        queue.use()
+        error = {}  # bound before the loop so that num=0 still returns a dict
+
+        for it in range(num):
+            for dID in self.di.S.keys():
+
+                # find probe, object and exit ID in dependence of dID
+                prep = self.diff_info[dID]
+                pID, oID, eID = prep.poe_IDs
+
+                # references for kernels
+                kern = self.kernels[prep.label]
+                FUK = kern.FUK
+                AWK = kern.AWK
+                PROP = kern.PROP
+
+                # get addresses and buffers
+                addr = prep.addr_gpu
+                mag = prep.mag
+                ma_sum = prep.ma_sum
+                err_fourier = prep.err_fourier_gpu
+                err_phot = prep.err_phot_gpu
+                err_exit = prep.err_exit_gpu
+                pbound = self.pbound_scan[prep.label]
+                aux = kern.aux
+
+                # local references
+                ma = self.ma.S[dID].gpu
+                ob = self.ob.S[oID].gpu
+                pr = self.pr.S[pID].gpu
+                ex = self.ex.S[eID].gpu
+
+                # compute log-likelihood
+                if self.p.compute_log_likelihood:
+                    AWK.build_aux_no_ex(aux, addr, ob, pr)
+                    PROP.fw(aux, aux)
+                    FUK.log_likelihood(aux, addr, mag, ma, err_phot)
+
+                # build auxiliary wave
+                #AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha)
+                AWK.make_aux(aux, addr, ob, pr, ex,
+                             c_po=self._c, c_e=1-self._c)
+
+                # forward FFT
+                PROP.fw(aux, aux)
+
+                # Deviation from measured data
+                FUK.fourier_error(aux, addr, mag, ma, ma_sum)
+                FUK.error_reduce(addr, err_fourier)
+                FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound)
+
+                # backward FFT
+                PROP.bw(aux, aux)
+
+                # build exit wave
+                #AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha)
+                AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b,
+                              c_po=self._a, c_e=-(self._a + self._b))
+                FUK.exit_error(aux, addr)
+                FUK.error_reduce(addr, err_exit)
+
+            parallel.barrier()
+
+            # overlap update, probe centring and position refinement follow
+            self.overlap_update()
+
+            self.center_probe()
+
+            parallel.barrier()
+            self.position_update()
+
+            self.curiter += 1
+            queue.synchronize()
+
+        for name, s in self.ob.S.items():
+            cp.asnumpy(s.gpu, stream=self.queue, out=s.data)
+        for name, s in self.pr.S.items():
+            cp.asnumpy(s.gpu, stream=self.queue, out=s.data)
+
+        queue.synchronize()
+
+        # costly but needed to sync back with
+        # for name, s in self.ex.S.items():
+        #     s.data[:] = s.gpu.get()
+        for dID, prep in self.diff_info.items():
+            err_fourier = prep.err_fourier_gpu.get()
+            err_phot = prep.err_phot_gpu.get()
+            err_exit = prep.err_exit_gpu.get()
+            errs = np.ascontiguousarray(
+                np.vstack([err_fourier, err_phot, err_exit]).T)
+            error.update(zip(prep.view_IDs, errs))
+
+        self.error = error
+        return error
+
+    def position_update(self):
+        """
+        Refine scan positions on the GPU if a refinement step is due.
+        """
+        if not self.do_position_refinement or (not self.curiter):
+            return
+        do_update_pos = (self.p.position_refinement.stop >
+                         self.curiter >= self.p.position_refinement.start)
+        do_update_pos &= (self.curiter %
+                          self.p.position_refinement.interval) == 0
+        use_tiles = (not self.p.probe_update_cuda_atomics) or (
+            not self.p.object_update_cuda_atomics)
+
+        # Update positions
+        if do_update_pos:
+            self.queue.use()
+            """
+            Iterates through all positions and refines them by a given algorithm.
+            """
+            log(4, "----------- START POS REF -------------")
+            for dID in self.di.S.keys():
+
+                prep = self.diff_info[dID]
+                pID, oID, eID = prep.poe_IDs
+                ma = self.ma.S[dID].gpu
+                ob = self.ob.S[oID].gpu
+                pr = self.pr.S[pID].gpu
+                kern = self.kernels[prep.label]
+                aux = kern.aux
+                addr = prep.addr_gpu
+                original_addr = prep.original_addr  # NOTE(review): not used below
+                mangled_addr = prep.mangled_addr_gpu
+                mag = prep.mag
+                ma_sum = prep.ma_sum
+                err_fourier = prep.err_fourier_gpu
+                error_state = prep.error_state_gpu
+
+                PCK = kern.PCK
+                TK = kern.TK
+                PROP = kern.PROP
+
+                # Keep track of object boundaries
+                max_oby = ob.shape[-2] - aux.shape[-2] - 1
+                max_obx = ob.shape[-1] - aux.shape[-1] - 1
+
+                # We need to re-calculate the current error
+                PCK.build_aux(aux, addr, ob, pr)
+                PROP.fw(aux, aux)
+                if self.p.position_refinement.metric == "fourier":
+                    PCK.fourier_error(aux, addr, mag, ma, ma_sum)
+                    PCK.error_reduce(addr, err_fourier)
+                if self.p.position_refinement.metric == "photon":
+                    PCK.log_likelihood(aux, addr, mag, ma, err_fourier)
+                cp.cuda.runtime.memcpyAsync(dst=error_state.data.ptr,
+                                            src=err_fourier.data.ptr,
+                                            size=err_fourier.nbytes,
+                                            kind=3,  # device to device
+                                            stream=self.queue.ptr)
+
+                PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0])
+
+                log(4, 'Position refinement trial: iteration %s' % (self.curiter))
+                for i in range(PCK.mangler.nshifts):
+                    PCK.mangler.get_address(
+                        i, addr, mangled_addr, max_oby, max_obx)
+                    PCK.build_aux(aux, mangled_addr, ob, pr)
+                    PROP.fw(aux, aux)
+                    if self.p.position_refinement.metric == "fourier":
+                        PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum)
+                        PCK.error_reduce(mangled_addr, err_fourier)
+                    if self.p.position_refinement.metric == "photon":
+                        PCK.log_likelihood(
+                            aux, mangled_addr, mag, ma, err_fourier)
+                    PCK.update_addr_and_error_state(
+                        addr, error_state, mangled_addr, err_fourier)
+
+                cp.cuda.runtime.memcpyAsync(dst=err_fourier.data.ptr,
+                                            src=error_state.data.ptr,
+                                            size=err_fourier.nbytes,
+                                            kind=3,
+                                            stream=self.queue.ptr)  # d2d
+                if use_tiles:  # non-atomic updates read addr2_gpu; refresh it
+                    s1 = addr.shape[0] * addr.shape[1]
+                    s2 = addr.shape[2] * addr.shape[3]
+                    TK.transpose(addr.reshape(s1, s2),
+                                 prep.addr2_gpu.reshape(s2, s1))
+
+    def center_probe(self):
+        """Shift probe, object and exit waves so the probe intensity is centred."""
+        if self.p.probe_center_tol is not None:
+            self.queue.use()
+            for name, pr_s in self.pr.storages.items():
+                psum_d = self.A2SK.abs2sum(pr_s.gpu)
+                c1 = self.MCK.mass_center(psum_d).get()
+                c2 = (np.asarray(pr_s.shape[-2:]) // 2).astype(c1.dtype)
+                shift = c2 - c1
+                # within tolerance: stop. NOTE(review): break also skips all remaining probes
+                if u.norm(shift) < self.p.probe_center_tol:
+                    break
+
+                # shift the probe
+                pr_s.gpu = self.ISK.interpolate_shift(pr_s.gpu, shift)
+
+                # shift the object
+                ob_s = pr_s.views[0].pod.ob_view.storage
+                ob_s.gpu = self.ISK.interpolate_shift(ob_s.gpu, shift)
+
+                # shift the exit waves
+                for dID in self.di.S.keys():
+                    prep = self.diff_info[dID]
+                    pID, oID, eID = prep.poe_IDs
+                    if pID == name:
+                        self.ex.S[eID].gpu = self.ISK.interpolate_shift(
+                            self.ex.S[eID].gpu, shift)
+
+                log(4, 'Probe recentered from %s to %s'
+                    % (str(tuple(c1)), str(tuple(c2))))
+
+    # object update
+    def object_update(self, MPI=False):
+        """Rescale the objects, add the exit wave contributions and normalise."""
+        use_atomics = self.p.object_update_cuda_atomics
+        queue = self.queue
+        queue.synchronize()
+        queue.use()
+        for oID, ob in self.ob.storages.items():
+            obn = self.ob_nrm.S[oID]
+            cfact = self.ob_cfact[oID]
+
+            if self.p.obj_smooth_std is not None:
+                log(4, 'Smoothing object, cfact is %.2f' % cfact)
+                obb = self.ob_buf.S[oID]
+                smooth_mfs = [self.p.obj_smooth_std, self.p.obj_smooth_std]
+                self.GSK.convolution(ob.gpu, smooth_mfs, tmp=obb.gpu)
+
+            ob.gpu *= cfact
+            obn.gpu.fill(cfact)
+            queue.synchronize()
+
+        # storage for-loop
+        for dID in self.di.S.keys():
+            prep = self.diff_info[dID]
+
+            POK = self.kernels[prep.label].POK
+            # find probe, object in exit ID in dependence of dID
+            pID, oID, eID = prep.poe_IDs
+
+            # scan for loop
+            addr = prep.addr_gpu if use_atomics else prep.addr2_gpu
+            ev = POK.ob_update(addr,
+                               self.ob.S[oID].gpu,
+                               self.ob_nrm.S[oID].gpu,
+                               self.pr.S[pID].gpu,
+                               self.ex.S[eID].gpu,
+                               atomics=use_atomics)
+            queue.synchronize()
+
+        for oID, ob in self.ob.storages.items():
+            obn = self.ob_nrm.S[oID]
+            self.multigpu.allReduceSum(ob.gpu)
+            self.multigpu.allReduceSum(obn.gpu)
+            with queue:
+                ob.gpu /= obn.gpu
+
+            self.clip_object(ob.gpu)
+            queue.synchronize()
+
+    # probe update
+    def probe_update(self, MPI=False):
+        """
+        Update all probes and return the root of the summed relative change.
+        """
+        queue = self.queue
+        use_atomics = self.p.probe_update_cuda_atomics
+        # relative change, accumulated over all probes on the device
+        change_gpu = cp.zeros((1,), dtype=np.float32)
+
+        # scale probe and normalisation by the per-probe factor pr_cfact
+        for pID, pr in self.pr.storages.items():
+            prn = self.pr_nrm.S[pID]
+            cfact = self.pr_cfact[pID]
+            pr.gpu *= cfact
+            prn.gpu.fill(cfact)
+
+        for dID in self.di.S.keys():
+            prep = self.diff_info[dID]
+            POK = self.kernels[prep.label].POK
+            # find probe, object in exit ID in dependence of dID
+            pID, oID, eID = prep.poe_IDs
+            addr = prep.addr_gpu if use_atomics else prep.addr2_gpu
+            POK.pr_update(addr,
+                          self.pr.S[pID].gpu,
+                          self.pr_nrm.S[pID].gpu,
+                          self.ob.S[oID].gpu,
+                          self.ex.S[eID].gpu,
+                          atomics=use_atomics)
+            queue.synchronize()
+
+        for pID, pr in self.pr.storages.items():
+            buf = self.pr_buf.S[pID]
+            prn = self.pr_nrm.S[pID]
+
+            self.multigpu.allReduceSum(pr.gpu)
+            self.multigpu.allReduceSum(prn.gpu)
+            pr.gpu /= prn.gpu
+            self.support_constraint(pr)
+
+            # calculate change on GPU
+            queue.synchronize()
+            AUK = self.kernels[list(self.kernels)[0]].AUK
+            buf.gpu -= pr.gpu
+            change_gpu += (AUK.norm2(buf.gpu) / AUK.norm2(pr.gpu))
+            buf.gpu[:] = pr.gpu
+
+        # reduce once after the loop: reducing per probe would re-sum earlier terms
+        self.multigpu.allReduceSum(change_gpu)
+        change = change_gpu.get().item() / parallel.size
+        return np.sqrt(change)
+
+    def support_constraint(self, storage=None):
+        """
+        Enforces 2D support constraint on `storage` (all probes if None).
+        """
+        if storage is None:
+            for s in self.pr.storages.values():
+                self.support_constraint(s)
+            return  # storages handled above; None itself has no ID
+
+        # Fourier space
+        support = self._probe_fourier_support.get(storage.ID)
+        if support is not None:
+            if storage.ID not in self.FSK:
+                self.FSK[storage.ID] = FourierSupportKernel(
+                    support.astype(np.complex64), self.queue, self.p.fft_lib)
+                self.FSK[storage.ID].allocate()
+            self.FSK[storage.ID].apply_fourier_support(storage.gpu)
+
+        # Real space
+        support = self._probe_support.get(storage.ID)
+        if support is not None:
+            if storage.ID not in self.RSK:
+                self.RSK[storage.ID] = RealSupportKernel(
+                    support.astype(np.complex64))
+                self.RSK[storage.ID].allocate()
+            self.RSK[storage.ID].apply_real_support(storage.gpu)
+
+    def clip_object(self, ob):
+        """
+        Clips magnitudes of `ob` into the range `self.p.clip_object`, if set.
+        """
+        if self.p.clip_object is not None:
+            cmin, cmax = self.p.clip_object
+            self.CMK.clip_magnitudes_to_range(ob, cmin, cmax)
+
+    def engine_finalize(self):
+        """
+        Copy host arrays, delete GPU arrays and free the cupy memory pools.
+        """
+        # revert page-locked memory + delete GPU memory
+        for name, s in self.ob.S.items():
+            s.data = np.copy(s.data)
+            del s.gpu
+        for name, s in self.ob_buf.S.items():
+            s.data = np.copy(s.data)
+            del s.gpu
+        for name, s in self.ob_nrm.S.items():
+            s.data = np.copy(s.data)
+            del s.gpu
+        for name, s in self.pr.S.items():
+            s.data = np.copy(s.data)
+            del s.gpu
+        for name, s in self.pr_buf.S.items():
+            s.data = np.copy(s.data)
+            del s.gpu
+        for name, s in self.pr_nrm.S.items():
+            s.data = np.copy(s.data)
+            del s.gpu
+
+        # copy addr to cpu
+        for dID, prep in self.diff_info.items():
+            prep.addr = prep.addr_gpu.get()
+            del prep.addr_gpu
+
+        # release the blocks cached by the device and pinned-host memory pools
+        mempool = cp.get_default_memory_pool()
+        mempool.free_all_blocks()
+        pinned_pool = cp.get_default_pinned_memory_pool()
+        pinned_pool.free_all_blocks()
+
+
+        # we don't need the  "benchmarking" in DM_serial
+        super().engine_finalize(benchmark=False)
+
+
+@register(name="DM_cupy_nostream")
+class DM_cupy(_ProjectionEngine_cupy, DMMixin):
+    """
+    A full-fledged Difference Map engine accelerated with cupy.
+
+    Defaults:
+
+    [name]
+    default = DM_cupy
+    type = str
+    help =
+    doc =
+
+    """
+
+    def __init__(self, ptycho_parent, pars=None):
+        _ProjectionEngine_cupy.__init__(self, ptycho_parent, pars)
+        DMMixin.__init__(self, self.p.alpha)
+        ptycho_parent.citations.add_article(**self.article)
+
+
+@register(name="RAAR_cupy_nostream")
+class RAAR_cupy(_ProjectionEngine_cupy, RAARMixin):
+    """
+    A RAAR engine accelerated with cupy.
+
+    Defaults:
+
+    [name]
+    default = RAAR_cupy
+    type = str
+    help =
+    doc =
+
+    """
+
+    def __init__(self, ptycho_parent, pars=None):
+        _ProjectionEngine_cupy.__init__(self, ptycho_parent, pars)
+        RAARMixin.__init__(self, self.p.beta)
diff --git a/ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py
new file mode 100644
index 000000000..b64ad5e82
--- /dev/null
+++ b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py
@@ -0,0 +1,556 @@
+# -*- coding: utf-8 -*-
+"""
+Difference Map reconstruction engine for NVIDIA GPUs.
+
+This engine uses three streams: one for the compute queue and one for each I/O queue.
+Events are used to synchronize download / compute / upload. As we cannot manipulate
+memory for each loop over the state vector, a certain number of memory sections is
+preallocated and reused.
+
+This file is part of the PTYPY package.
+
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: see LICENSE for details.
+"""
+
+import numpy as np
+import time
+import cupy as cp
+import cupyx
+
+from ptypy import utils as u
+from ptypy.accelerate.cuda_cupy import log_device_memory_stats
+from ptypy.utils.verbose import log, logger
+from ptypy.utils import parallel
+from ptypy.engines import register
+from ptypy.engines.projectional import DMMixin, RAARMixin
+from . import projectional_cupy
+
+from ..mem_utils import make_pagelocked_paired_arrays as mppa
+from ..mem_utils import GpuDataManager
+
+EX_MA_BLOCKS_RATIO = 2
+# can be used to limit the number of blocks, simulating that they don't fit
+MAX_BLOCKS = 99999
+# MAX_BLOCKS = 3  # can be used to limit the number of blocks, simulating that they don't fit
+
+__all__ = ['DM_cupy_stream', 'RAAR_cupy_stream']
+
+
+class _ProjectionEngine_cupy_stream(projectional_cupy._ProjectionEngine_cupy):
+
+    def __init__(self, ptycho_parent, pars=None):
+        # the three GPU block managers set to None here are created in _setup_kernels
+        super().__init__(ptycho_parent, pars)
+        self.ma_data = None
+        self.mag_data = None
+        self.ex_data = None
+
+    def engine_initialize(self):
+        super().engine_initialize()
+        self.qu_htod = cp.cuda.Stream()  # stream passed to the to_gpu uploads
+        self.qu_dtoh = cp.cuda.Stream()  # stream passed to the from_gpu downloads
+
+    def _setup_kernels(self):
+        """Set up kernels, then size the GPU block managers to the free memory."""
+        super()._setup_kernels()
+        ex_mem = 0
+        mag_mem = 0
+        for scan, kern in self.kernels.items():
+            ex_mem = max(kern.aux.nbytes, ex_mem)
+            mag_mem = max(kern.FUK.gpu.fdev.nbytes, mag_mem)
+        ma_mem = mag_mem
+
+        blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem
+
+        # We need to add the free memory from the pool to the free device memory,
+        # as both will be used for allocations
+        mempool = cp.get_default_memory_pool()
+        mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes()
+
+        # leave 200MB room for safety
+        fit = int(mem - 200 * 1024 * 1024) // blk
+        if fit <= 0:  # also catches the negative result when < 200MB is free
+            log(1, "Cannot fit memory into device, if possible reduce frames per block. Exiting...")
+            raise SystemExit("ptypy has been exited.")
+
+        # TODO grow blocks dynamically
+        nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS)
+        nma = min(fit, MAX_BLOCKS)
+        log_device_memory_stats(4)
+        log(4, 'cupy max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(
+            nex, nma))
+        # reset memory or create new
+        self.ex_data = GpuDataManager(ex_mem, 0, nex, True)
+        self.ma_data = GpuDataManager(ma_mem, 0, nma, False)
+        self.mag_data = GpuDataManager(mag_mem, 0, nma, False)
+
+    def engine_prepare(self):
+        # NOTE: this calls engine_prepare of _ProjectionEngine_cupy's parent class
+        super(projectional_cupy._ProjectionEngine_cupy, self).engine_prepare()
+
+        for name, s in self.ob.S.items():
+            s.gpu = cp.asarray(s.data)
+        for name, s in self.ob_buf.S.items():
+            s.gpu, s.data = mppa(s.data)
+        for name, s in self.ob_nrm.S.items():
+            s.gpu, s.data = mppa(s.data)
+        for name, s in self.pr.S.items():
+            s.gpu, s.data = mppa(s.data)
+        for name, s in self.pr_buf.S.items():
+            s.gpu, s.data = mppa(s.data)
+        for name, s in self.pr_nrm.S.items():
+            s.gpu, s.data = mppa(s.data)
+
+        use_tiles = (not self.p.probe_update_cuda_atomics) or (
+            not self.p.object_update_cuda_atomics)
+
+        # Extra object buffer for smoothing kernel
+        if self.p.obj_smooth_std is not None:
+            for name, s in self.ob_buf.S.items():
+                s.tmp = cp.empty(s.gpu.shape, s.gpu.dtype)
+
+        # TODO : like the serialization this one is needed due to object reformatting
+        for label, d in self.di.storages.items():
+            prep = self.diff_info[d.ID]
+            prep.addr_gpu = cp.asarray(prep.addr)
+            if use_tiles:
+                prep.addr2 = np.ascontiguousarray(
+                    np.transpose(prep.addr, (2, 3, 0, 1)))
+                prep.addr2_gpu = cp.asarray(prep.addr2)
+            if self.do_position_refinement:
+                prep.mangled_addr_gpu = prep.addr_gpu.copy()
+
+        for label, d in self.ptycho.new_data:
+            dID = d.ID
+            prep = self.diff_info[dID]
+            pID, oID, eID = prep.poe_IDs
+
+            prep.ma_sum_gpu = cp.asarray(prep.ma_sum)
+            # device-side error arrays; the page-locked host buffers follow below
+            prep.err_fourier_gpu = cp.asarray(prep.err_fourier)
+            prep.err_phot_gpu = cp.asarray(prep.err_phot)
+            prep.err_exit_gpu = cp.asarray(prep.err_exit)
+            if self.do_position_refinement:
+                prep.error_state_gpu = cp.empty_like(prep.err_fourier_gpu)
+            ma = self.ma.S[dID].data.astype(np.float32)
+            prep.ma = cupyx.empty_pinned(ma.shape, ma.dtype, order="C")
+            prep.ma[:] = ma
+            ex = self.ex.S[eID].data
+            prep.ex = cupyx.empty_pinned(ex.shape, ex.dtype, order="C")
+            prep.ex[:] = ex
+            mag = prep.mag
+            prep.mag = cupyx.empty_pinned(mag.shape, mag.dtype, order="C")
+            prep.mag[:] = mag
+
+            log(4, 'Free memory on device: %.2f GB' %
+                (float(cp.cuda.runtime.memGetInfo()[0])/1e9))
+            self.ex_data.add_data_block()
+            self.ma_data.add_data_block()
+            self.mag_data.add_data_block()
+
+    def engine_iterate(self, num=1):
+        """
+        Compute `num` iterations and return the error dictionary.
+        """
+        # ma_buf = ma_c = np.zeros(FUK.fshape, dtype=np.float32)
+        self.dID_list = list(self.di.S.keys())
+        atomics_probe = self.p.probe_update_cuda_atomics
+        atomics_object = self.p.object_update_cuda_atomics
+        use_tiles = (not atomics_object) or (not atomics_probe)
+
+        for it in range(num):
+
+            error = {}
+
+            for inner in range(self.p.overlap_max_iterations):
+
+                change = 0
+
+                do_update_probe = (self.curiter >= self.p.probe_update_start)
+                do_update_object = (self.p.update_object_first or (
+                    inner > 0) or not do_update_probe)
+                do_update_fourier = (inner == 0)
+
+                # initialize probe and object buffer to receive an update
+                if do_update_object:
+                    for oID, ob in self.ob.storages.items():
+                        cfact = self.ob_cfact[oID]
+                        obn = self.ob_nrm.S[oID]
+                        obb = self.ob_buf.S[oID]
+
+                        if self.p.obj_smooth_std is not None:
+                            log(4, 'Smoothing object, cfact is %.2f' % cfact)
+                            smooth_mfs = [self.p.obj_smooth_std,
+                                          self.p.obj_smooth_std]
+                            # We need a third copy, because we still need ob.gpu for the fourier update
+                            obb.gpu[:] = ob.gpu[:]
+                            self.GSK.convolution(
+                                obb.gpu, smooth_mfs, tmp=obb.tmp)
+                            obb.gpu *= np.complex64(cfact)
+                        else:
+                            # obb.gpu[:] = ob.gpu * np.complex64(cfact)
+                            cp.multiply(ob.gpu, np.complex64(cfact), out=obb.gpu)
+                        obn.gpu.fill(np.float32(cfact))
+
+                # First cycle: Fourier + object update
+                for iblock, dID in enumerate(self.dID_list):
+                    prep = self.diff_info[dID]
+
+                    # find probe, object in exit ID in dependence of dID
+                    pID, oID, eID = prep.poe_IDs
+
+                    # references for kernels
+                    kern = self.kernels[prep.label]
+                    FUK = kern.FUK
+                    AWK = kern.AWK
+                    POK = kern.POK
+
+                    pbound = self.pbound_scan[prep.label]
+                    aux = kern.aux
+                    PROP = kern.PROP
+
+                    # get addresses and auxiliary array
+                    addr = prep.addr_gpu
+                    addr2 = prep.addr2_gpu if use_tiles else None
+                    err_fourier = prep.err_fourier_gpu
+                    err_phot = prep.err_phot_gpu
+                    err_exit = prep.err_exit_gpu
+                    ma_sum = prep.ma_sum_gpu
+
+                    # local references
+                    ob = self.ob.S[oID].gpu
+                    obn = self.ob_nrm.S[oID].gpu
+                    obb = self.ob_buf.S[oID].gpu
+                    pr = self.pr.S[pID].gpu
+
+                    # Schedule ex to device
+                    ev_ex, ex, data_ex = self.ex_data.to_gpu(
+                        prep.ex, dID, self.qu_htod)
+
+                    # Fourier update.
+                    if do_update_fourier:
+                        self.ex_data.syncback = True
+                        log(4, '----- Fourier update -----', True)
+
+                        # Schedule ma & mag to device
+                        ev_ma, ma, data_ma = self.ma_data.to_gpu(
+                            prep.ma, dID, self.qu_htod)
+                        ev_mag, mag, data_mag = self.mag_data.to_gpu(
+                            prep.mag, dID, self.qu_htod)
+
+                        # compute log-likelihood
+                        if self.p.compute_log_likelihood:
+                            AWK.build_aux_no_ex(aux, addr, ob, pr)
+                            PROP.fw(aux, aux)
+                            # synchronize h2d stream with compute stream
+                            self.queue.wait_event(ev_mag)
+                            FUK.log_likelihood(aux, addr, mag, ma, err_phot)
+
+                        # synchronize h2d stream with compute stream
+                        self.queue.wait_event(ev_ex)
+                        #AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha)
+                        AWK.make_aux(aux, addr, ob, pr, ex,
+                                     c_po=self._c, c_e=1-self._c)
+
+                        # FFT
+                        PROP.fw(aux, aux)
+
+                        # Deviation from measured data
+                        # synchronize h2d stream with compute stream
+                        self.queue.wait_event(ev_mag)
+                        FUK.fourier_error(aux, addr, mag, ma, ma_sum)
+                        FUK.error_reduce(addr, err_fourier)
+                        FUK.fmag_all_update(
+                            aux, addr, mag, ma, err_fourier, pbound)
+
+                        data_mag.record_done(self.queue, 'compute')
+                        data_ma.record_done(self.queue, 'compute')
+
+                        PROP.bw(aux, aux)
+                        # apply changes
+                        #AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha)
+                        AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b,
+                                      c_po=self._a, c_e=-(self._a + self._b))
+                        FUK.exit_error(aux, addr)
+                        FUK.error_reduce(addr, err_exit)
+
+                    prestr = '%d Iteration (Overlap) #%02d:  ' % (
+                        parallel.rank, inner)
+
+                    # Update object
+                    if do_update_object:
+                        log(4, prestr + '----- object update -----', True)
+                        addrt = addr if atomics_object else addr2
+                        self.queue.wait_event(ev_ex)
+                        POK.ob_update(addrt, obb, obn, pr, ex,
+                                      atomics=atomics_object)
+
+                    data_ex.record_done(self.queue, 'compute')
+                    if iblock + len(self.ex_data) < len(self.dID_list):
+                        data_ex.from_gpu(self.qu_dtoh)
+
+                # swap direction
+                if do_update_fourier or do_update_object:
+                    self.dID_list.reverse()
+
+                # wait for compute stream to finish
+                self.queue.synchronize()
+
+                if do_update_object:
+
+                    for oID, ob in self.ob.storages.items():
+                        obn = self.ob_nrm.S[oID]
+                        obb = self.ob_buf.S[oID]
+                        self.multigpu.allReduceSum(obb.gpu)
+                        self.multigpu.allReduceSum(obn.gpu)
+                        obb.gpu /= obn.gpu
+
+                        self.clip_object(obb.gpu)
+                        ob.gpu[:] = obb.gpu
+
+                # Exit if probe should not yet be updated
+                if not do_update_probe:
+                    break
+                self.ex_data.syncback = False
+
+                # Update probe
+                log(4, prestr + '----- probe update -----', True)
+                change = self.probe_update()
+                log(4, prestr + 'change in probe is %.3f' % change, True)
+
+                # stop iteration if probe change is small
+                if change < self.p.overlap_converge_factor:
+                    break
+
+            self.queue.synchronize()
+            parallel.barrier()
+
+            if self.do_position_refinement and (self.curiter):
+                do_update_pos = (self.p.position_refinement.stop >
+                                 self.curiter >= self.p.position_refinement.start)
+                do_update_pos &= (self.curiter %
+                                  self.p.position_refinement.interval) == 0
+
+                # Update positions
+                if do_update_pos:
+                    """
+                    Iterates through all positions and refines them by a given algorithm. 
+                    """
+                    log(4, "----------- START POS REF -------------")
+                    for dID in self.di.S.keys():
+
+                        prep = self.diff_info[dID]
+                        pID, oID, eID = prep.poe_IDs
+                        ob = self.ob.S[oID].gpu
+                        pr = self.pr.S[pID].gpu
+                        kern = self.kernels[prep.label]
+                        aux = kern.aux
+                        addr = prep.addr_gpu
+                        original_addr = prep.original_addr  # NOTE(review): not used below
+                        mangled_addr = prep.mangled_addr_gpu
+                        ma_sum = prep.ma_sum_gpu
+                        err_fourier = prep.err_fourier_gpu
+                        error_state = prep.error_state_gpu
+
+                        PCK = kern.PCK
+                        TK = kern.TK
+                        PROP = kern.PROP
+
+                        # Make sure our data arrays are on device
+                        ev_ma, ma, data_ma = self.ma_data.to_gpu(
+                            prep.ma, dID, self.qu_htod)
+                        ev_mag, mag, data_mag = self.mag_data.to_gpu(
+                            prep.mag, dID, self.qu_htod)
+
+                        # Keep track of object boundaries
+                        max_oby = ob.shape[-2] - aux.shape[-2] - 1
+                        max_obx = ob.shape[-1] - aux.shape[-1] - 1
+
+                        # We need to re-calculate the current error
+                        PCK.build_aux(aux, addr, ob, pr)
+                        PROP.fw(aux, aux)
+                        # wait for data to arrive
+                        self.queue.wait_event(ev_mag)
+
+                        # We need to re-calculate the current error
+                        if self.p.position_refinement.metric == "fourier":
+                            PCK.fourier_error(aux, addr, mag, ma, ma_sum)
+                            PCK.error_reduce(addr, err_fourier)
+                        if self.p.position_refinement.metric == "photon":
+                            PCK.log_likelihood(aux, addr, mag, ma, err_fourier)
+                        cp.cuda.runtime.memcpyAsync(dst=error_state.data.ptr,
+                                                    src=err_fourier.data.ptr,
+                                                    size=err_fourier.nbytes,
+                                                    kind=3,  # device to device
+                                                    stream=self.queue.ptr)
+
+                        log(4, 'Position refinement trial: iteration %s' %
+                            (self.curiter))
+                        PCK.mangler.setup_shifts(
+                            self.curiter, nframes=addr.shape[0])
+                        for i in range(PCK.mangler.nshifts):
+                            PCK.mangler.get_address(
+                                i, addr, mangled_addr, max_oby, max_obx)
+                            PCK.build_aux(aux, mangled_addr, ob, pr)
+                            PROP.fw(aux, aux)
+                            if self.p.position_refinement.metric == "fourier":
+                                PCK.fourier_error(
+                                    aux, mangled_addr, mag, ma, ma_sum)
+                                PCK.error_reduce(mangled_addr, err_fourier)
+                            if self.p.position_refinement.metric == "photon":
+                                PCK.log_likelihood(
+                                    aux, mangled_addr, mag, ma, err_fourier)
+                            PCK.update_addr_and_error_state(
+                                addr, error_state, mangled_addr, err_fourier)
+
+                        data_mag.record_done(self.queue, 'compute')
+                        data_ma.record_done(self.queue, 'compute')
+                        cp.cuda.runtime.memcpyAsync(dst=err_fourier.data.ptr,
+                                               src=error_state.data.ptr,
+                                               size=err_fourier.nbytes, 
+                                               kind=3, # d2d
+                                               stream=self.queue.ptr)
+                        if use_tiles:
+                            s1 = prep.addr_gpu.shape[0] * \
+                                prep.addr_gpu.shape[1]
+                            s2 = prep.addr_gpu.shape[2] * \
+                                prep.addr_gpu.shape[3]
+                            TK.transpose(prep.addr_gpu.reshape(
+                                s1, s2), prep.addr2_gpu.reshape(s2, s1))
+
+            self.curiter += 1
+            self.queue.synchronize()
+
+        for name, s in self.ob.S.items():
+            cp.asnumpy(s.gpu, stream=self.queue, out=s.data)
+        for name, s in self.pr.S.items():
+            cp.asnumpy(s.gpu, stream=self.queue, out=s.data)
+
+        self.queue.synchronize()
+        
+        # costly but needed to sync back with
+        # for name, s in self.ex.S.items():
+        #     s.data[:] = s.gpu.get()
+        for dID, prep in self.diff_info.items():
+            err_fourier = prep.err_fourier_gpu.get()
+            err_phot = prep.err_phot_gpu.get()
+            err_exit = prep.err_exit_gpu.get()
+            errs = np.ascontiguousarray(
+                np.vstack([err_fourier, err_phot, err_exit]).T)
+            error.update(zip(prep.view_IDs, errs))
+
+        self.error = error
+        return error
+
+    # probe update
+    def probe_update(self, MPI=False):
+        """Update all probe storages from the exit waves and return np.sqrt of the resulting change."""
+        queue = self.queue
+        use_atomics = self.p.probe_update_cuda_atomics
+        # pre-scale each probe by its cfact and fill its normalisation with cfact (MPI is not read below)
+        change_gpu = cp.zeros((1,), dtype=np.float32)
+        for pID, pr in self.pr.storages.items():
+            prn = self.pr_nrm.S[pID]
+            cfact = self.pr_cfact[pID]
+            with queue:
+                pr.gpu *= np.float32(cfact)
+                prn.gpu.fill(np.float32(cfact))
+
+        for iblock, dID in enumerate(self.dID_list):
+            prep = self.diff_info[dID]
+
+            POK = self.kernels[prep.label].POK
+            # find probe, object and exit ID in dependence of dID
+            pID, oID, eID = prep.poe_IDs
+            # exit wave block is requested on the h2d stream; the compute stream waits for its event
+            ev, ex, data_ex = self.ex_data.to_gpu(prep.ex, dID, self.qu_htod)
+            self.queue.wait_event(ev)
+
+            addrt = prep.addr_gpu if use_atomics else prep.addr2_gpu  # addr2_gpu for the non-atomic kernel
+            ev = POK.pr_update(addrt,
+                               self.pr.S[pID].gpu,
+                               self.pr_nrm.S[pID].gpu,
+                               self.ob.S[oID].gpu,
+                               ex,
+                               atomics=use_atomics)
+
+            data_ex.record_done(self.queue, 'compute')
+            if iblock + len(self.ex_data) < len(self.dID_list):
+                data_ex.from_gpu(self.qu_dtoh)
+
+        self.dID_list.reverse()
+
+        self.queue.synchronize()
+        self.queue.use()
+        for pID, pr in self.pr.storages.items():
+            buf = self.pr_buf.S[pID]
+            prn = self.pr_nrm.S[pID]
+
+            self.multigpu.allReduceSum(pr.gpu)
+            self.multigpu.allReduceSum(prn.gpu)
+            pr.gpu /= prn.gpu
+            self.support_constraint(pr)
+
+            # calculate change on GPU
+            AUK = self.kernels[list(self.kernels)[0]].AUK
+            buf.gpu -= pr.gpu
+            change_gpu += (AUK.norm2(buf.gpu) / AUK.norm2(pr.gpu))
+            buf.gpu[:] = pr.gpu
+            self.multigpu.allReduceSum(change_gpu)
+            change = change_gpu.get().item() / parallel.size
+
+        return np.sqrt(change)  # NOTE(review): `change` is only bound inside the loop above
+
+    def engine_finalize(self):
+        """
+        Clear all GPU data, pinned memory, etc., then finalize the parent
+        engine and log the device memory statistics.
+        """
+        # drop the references held in ex_data, ma_data and mag_data
+        self.ex_data = self.ma_data = self.mag_data = None
+
+        super().engine_finalize()
+
+        log_device_memory_stats(4)
+
+@register(name="DM_cupy")
+class DM_cupy_stream(_ProjectionEngine_cupy_stream, DMMixin):
+    """
+    A full-fledged Difference Map engine accelerated with cupy.
+
+    Defaults:
+
+    [name]
+    default = DM_cupy
+    type = str
+    help =
+    doc =
+
+    """
+
+    def __init__(self, ptycho_parent, pars=None):
+        _ProjectionEngine_cupy_stream.__init__(self, ptycho_parent, pars)  # engine base first
+        DMMixin.__init__(self, self.p.alpha)  # the mixin is configured from self.p.alpha
+        ptycho_parent.citations.add_article(**self.article)  # add self.article to the parent's citations
+
+
+@register(name="RAAR_cupy")
+class RAAR_cupy_stream(_ProjectionEngine_cupy_stream, RAARMixin):
+    """
+    A RAAR engine accelerated with cupy.
+
+    Defaults:
+
+    [name]
+    default = RAAR_cupy
+    type = str
+    help =
+    doc =
+
+    """
+
+    def __init__(self, ptycho_parent, pars=None):
+        # engine base first: the mixin is configured from self.p.beta
+        _ProjectionEngine_cupy_stream.__init__(self, ptycho_parent, pars)
+        RAARMixin.__init__(self, self.p.beta)
diff --git a/ptypy/accelerate/cuda_cupy/engines/stochastic.py b/ptypy/accelerate/cuda_cupy/engines/stochastic.py
new file mode 100644
index 000000000..8af49d635
--- /dev/null
+++ b/ptypy/accelerate/cuda_cupy/engines/stochastic.py
@@ -0,0 +1,550 @@
+# -*- coding: utf-8 -*-
+"""
+Accelerated stochastic reconstruction engine.
+
+This file is part of the PTYPY package.
+
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: see LICENSE for details.
+"""
+
+import numpy as np
+import time
+import cupy as cp
+import cupyx
+
+from ptypy import utils as u
+from ptypy.utils.verbose import logger, log
+from ptypy.utils import parallel
+from ptypy.engines import register
+from ptypy.engines.stochastic import EPIEMixin, SDRMixin
+from ptypy.accelerate.base.engines.stochastic import _StochasticEngineSerial
+from ptypy.accelerate.base import address_manglers
+from .. import get_context
+from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel,\
+    PositionCorrectionKernel, PropagationKernel
+from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel,\
+    TransposeKernel, MaxAbs2Kernel, MassCenterKernel, Abs2SumKernel,\
+    InterpolatedShiftKernel
+from ..mem_utils import make_pagelocked_paired_arrays as mppa
+from ..mem_utils import GpuDataManager
+
+MPI = False
+
+EX_MA_BLOCKS_RATIO = 2
+# can be used to limit the number of blocks, simulating that they don't fit
+MAX_BLOCKS = 99999
+# MAX_BLOCKS = 10  # can be used to limit the number of blocks, simulating that they don't fit
+
+
+class _StochasticEngineCupy(_StochasticEngineSerial):
+
+    """
+    An accelerated implementation of a stochastic algorithm for ptychography
+
+    Defaults:
+
+    [fft_lib]
+    default = cuda
+    type = str
+    help = Choose the cupy-compatible FFT module.
+    doc = One of:
+      - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU)
+      - ``'cupy'`` : cupy's cufft wrapper (fast load, slowest compute due to additional store/load stages)
+    choices = 'cuda','cupy'
+    userlevel = 2
+
+    """
+
+    def __init__(self, ptycho_parent, pars=None):
+        """
+        Accelerated base engine for stochastic algorithms.
+        """
+        super().__init__(ptycho_parent, pars)
+        # placeholders for the mask, magnitude and exit wave data managers,
+        # which are created in _setup_kernels
+        self.ma_data = self.mag_data = self.ex_data = None
+
+    def engine_initialize(self):
+        """
+        Prepare for reconstruction: compute stream, optional probe centring kernels, transfer streams.
+        """
+        self.queue = get_context(new_queue=True)
+
+        # initialise kernels for centring probe if required
+        if self.p.probe_center_tol is not None:
+            # mass center kernel
+            self.MCK = MassCenterKernel(queue=self.queue)
+            # absolute sum kernel
+            self.A2SK = Abs2SumKernel(dtype=self.pr.dtype, queue=self.queue)
+            # interpolated shift kernel
+            self.ISK = InterpolatedShiftKernel(queue=self.queue)
+
+        super().engine_initialize()
+        self.qu_htod = cp.cuda.Stream()  # handed to the to_gpu() calls in engine_iterate
+        self.qu_dtoh = cp.cuda.Stream()  # handed to from_gpu() and used for the final copies to host
+
+    def _setup_kernels(self):
+        """
+        Setup kernels, one for each scan. Derive scans from ptycho class.
+        Also sizes the GPU data managers; raises SystemExit if not even one block fits.
+        """
+        fpc = 0
+
+        # get the scans
+        for label, scan in self.ptycho.model.scans.items():
+            kern = u.Param()
+            kern.scanmodel = type(scan).__name__
+            self.kernels[label] = kern
+            # TODO: needs to be adapted for broad bandwidth
+            geo = scan.geometries[0]
+
+            # Get info to shape buffer arrays
+            fpc = max(scan.max_frames_per_block, fpc)
+
+            # TODO : make this more foolproof
+            try:
+                nmodes = scan.p.coherence.num_probe_modes * \
+                    scan.p.coherence.num_object_modes
+            except Exception:
+                nmodes = 1
+
+            # create buffer arrays
+            ash = (nmodes,) + tuple(geo.shape)
+            aux = np.zeros(ash, dtype=np.complex64)
+            kern.aux = cp.asarray(aux)
+
+            # setup kernels, one for each SCAN.
+            log(4, "Setting up FourierUpdateKernel")
+            kern.FUK = FourierUpdateKernel(
+                aux, nmodes, queue_thread=self.queue)
+            kern.FUK.fshape = (1,) + kern.FUK.fshape[1:]
+            kern.FUK.allocate()
+
+            log(4, "Setting up PoUpdateKernel")
+            kern.POK = PoUpdateKernel(queue_thread=self.queue)
+            kern.POK.allocate()
+
+            log(4, "Setting up AuxiliaryWaveKernel")
+            kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue)
+            kern.AWK.allocate()
+
+            log(4, "Setting up ArrayUtilsKernel")
+            kern.AUK = ArrayUtilsKernel(queue=self.queue)
+
+            #log(4, "Setting up TransposeKernel")
+            #kern.TK = TransposeKernel(queue=self.queue)
+
+            log(4, "setting up MaxAbs2Kernel")
+            kern.MAK = MaxAbs2Kernel(queue=self.queue)
+
+            log(4, "Setting up PropagationKernel")
+            kern.PROP = PropagationKernel(
+                aux, geo.propagator, self.queue, self.p.fft_lib)
+            kern.PROP.allocate()
+            kern.resolution = geo.resolution[0]
+
+            if self.do_position_refinement:
+                log(4, "Setting up position correction")
+                kern.PCK = PositionCorrectionKernel(
+                    aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue)
+                kern.PCK.allocate()
+
+        ex_mem = 0
+        mag_mem = 0
+        for scan, kern in self.kernels.items():
+            if kern.scanmodel in ("GradFull", "BlockGradFull"):
+                ex_mem = max(kern.aux.nbytes * 1, ex_mem)
+            else:
+                ex_mem = max(kern.aux.nbytes * fpc, ex_mem)
+            mag_mem = max(kern.FUK.gpu.fdev.nbytes * fpc, mag_mem)
+        ma_mem = mag_mem
+        mem = cp.cuda.runtime.memGetInfo()[0]
+        blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem
+        # leave 200MB room for safety
+        fit = int(mem - 200 * 1024 * 1024) // blk
+        if fit <= 0:
+            log(1, "Cannot fit memory into device, if possible reduce frames per block. Exiting...")
+            # fit is negative (and truthy) when less than 200MB is free, hence `<= 0`;
+            # this engine only stores a stream from get_context, so no context is popped/detached
+            raise SystemExit("ptypy has been exited.")
+
+        # TODO grow blocks dynamically
+        nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS)
+        nma = min(fit, MAX_BLOCKS)
+
+        log(3, 'cupy max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma))
+        # reset memory or create new
+        self.ex_data = GpuDataManager(ex_mem, 0, nex, True)
+        self.ma_data = GpuDataManager(ma_mem, 0, nma, False)
+        self.mag_data = GpuDataManager(mag_mem, 0, nma, False)
+        log(4, "Kernel setup completed")
+
+    def engine_prepare(self):
+        """Move object, probe, addresses and the per-block arrays of new data to device / pinned memory."""
+        super().engine_prepare()
+        # mppa (make_pagelocked_paired_arrays) yields the device array and the replacement host array
+        for name, s in self.ob.S.items():
+            s.gpu, s.data = mppa(s.data)
+        for name, s in self.pr.S.items():
+            s.gpu, s.data = mppa(s.data)
+
+        for label, d in self.di.storages.items():
+            prep = self.diff_info[d.ID]
+            prep.addr_gpu = cp.asarray(prep.addr)
+            if self.do_position_refinement:
+                prep.mangled_addr_gpu = prep.addr_gpu.copy()
+
+        for label, d in self.ptycho.new_data:
+            dID = d.ID
+            prep = self.diff_info[dID]
+            pID, oID, eID = prep.poe_IDs
+            prep.ma_sum_gpu = cp.asarray(prep.ma_sum)
+            prep.err_fourier_gpu = cp.asarray(prep.err_fourier)
+            prep.err_phot_gpu = cp.asarray(prep.err_phot)
+            prep.err_exit_gpu = cp.asarray(prep.err_exit)
+            if self.do_position_refinement:
+                prep.error_state_gpu = cp.empty_like(prep.err_fourier_gpu)
+            prep.obn = cp.asarray(prep.obn)
+            prep.prn = cp.asarray(prep.prn)
+            # prepare page-locked mems (mask is stored as float32):
+            ma = self.ma.S[dID].data.astype(np.float32)
+            prep.ma = cupyx.empty_pinned(ma.shape, ma.dtype, order="C")
+            prep.ma[:] = ma
+            ex = self.ex.S[eID].data
+            prep.ex = cupyx.empty_pinned(ex.shape, ex.dtype, order="C")
+            prep.ex[:] = ex
+            mag = prep.mag
+            prep.mag = cupyx.empty_pinned(mag.shape, mag.dtype, order="C")
+            prep.mag[:] = mag
+
+            self.ex_data.add_data_block()
+            self.ma_data.add_data_block()
+            self.mag_data.add_data_block()
+
+    def engine_iterate(self, num=1):
+        """
+        Compute `num` iterations over all data blocks.
+        Returns a dict mapping view IDs to (fourier, photon, exit) errors.
+        """
+        self.dID_list = list(self.di.S.keys())
+        error = {}
+        for it in range(num):
+
+            for iblock, dID in enumerate(self.dID_list):
+
+                # find probe, object and exit ID in dependence of dID
+                prep = self.diff_info[dID]
+                pID, oID, eID = prep.poe_IDs
+
+                # references for kernels
+                kern = self.kernels[prep.label]
+                FUK = kern.FUK
+                AWK = kern.AWK
+                POK = kern.POK
+                MAK = kern.MAK
+                PROP = kern.PROP
+
+                aux = kern.aux  # aux buffer
+
+                # local references
+                ob = self.ob.S[oID].gpu
+                pr = self.pr.S[pID].gpu
+
+                # shuffle view order
+                vieworder = prep.vieworder
+                prep.rng.shuffle(vieworder)
+
+                # Schedule ex, ma, mag to device
+                ev_ex, ex_full, data_ex = self.ex_data.to_gpu(
+                    prep.ex, dID, self.qu_htod)
+                ev_mag, mag_full, data_mag = self.mag_data.to_gpu(
+                    prep.mag, dID, self.qu_htod)
+                ev_ma, ma_full, data_ma = self.ma_data.to_gpu(
+                    prep.ma, dID, self.qu_htod)
+
+                # Reference to ex, ma and mag
+                prep.ex_full = ex_full
+                prep.mag_full = mag_full
+                prep.ma_full = ma_full
+
+                # synchronize h2d stream with compute stream
+                self.queue.wait_event(ev_ex)
+                if self.do_position_refinement:
+                    # position_update_local reads mag and ma before the waits further down
+                    self.queue.wait_event(ev_mag)
+                    self.queue.wait_event(ev_ma)
+
+                # Iterate through views
+                for i in vieworder:
+
+                    # Get local address and arrays
+                    addr = prep.addr_gpu[i, None]
+                    ex_from, ex_to = prep.addr_ex[i]
+                    ex = prep.ex_full[ex_from:ex_to]
+                    mag = prep.mag_full[i, None]
+                    ma = prep.ma_full[i, None]
+                    ma_sum = prep.ma_sum_gpu[i, None]
+                    obn = prep.obn
+                    prn = prep.prn
+                    err_phot = prep.err_phot_gpu[i, None]
+                    err_fourier = prep.err_fourier_gpu[i, None]
+                    err_exit = prep.err_exit_gpu[i, None]
+
+                    # position update
+                    self.position_update_local(prep, i)
+
+                    # build auxiliary wave
+                    AWK.make_aux(aux, addr, ob, pr, ex,
+                                 c_po=self._c, c_e=1-self._c)
+
+                    PROP.fw(aux, aux)  # forward FFT
+
+                    # Deviation from measured data
+                    self.queue.wait_event(ev_mag)
+                    if self.p.compute_fourier_error:
+                        self.queue.wait_event(ev_ma)
+                        FUK.fourier_error(aux, addr, mag, ma, ma_sum)
+                        FUK.error_reduce(addr, err_fourier)
+                    else:
+                        FUK.fourier_deviation(aux, addr, mag)
+                        self.queue.wait_event(ev_ma)
+                    FUK.fmag_update_nopbound(aux, addr, mag, ma)
+
+                    PROP.bw(aux, aux)  # backward FFT
+
+                    # build exit wave
+                    AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b,
+                                  c_po=self._a, c_e=-(self._a + self._b))
+                    if self.p.compute_exit_error:
+                        FUK.exit_error(aux, addr)
+                        FUK.error_reduce(addr, err_exit)
+
+                    # build auxiliary wave (ob * pr product)
+                    AWK.build_aux2_no_ex(aux, addr, ob, pr)
+
+                    # object update
+                    POK.pr_norm_local(addr, pr, prn)
+                    POK.ob_update_local(
+                        addr, ob, pr, ex, aux, prn, a=self._ob_a, b=self._ob_b)
+
+                    # probe update; cupy's fill/max take no stream argument, so use the stream context
+                    if self._object_norm_is_global and self._pr_a == 0:
+                        obn_max = cp.empty((1,), dtype=np.float32)
+                        MAK.max_abs2(ob, obn_max)
+                        with self.queue:
+                            obn.fill(np.float32(0.))
+                    else:
+                        POK.ob_norm_local(addr, ob, obn)
+                        with self.queue:
+                            obn_max = cp.max(obn)
+                    if self.p.probe_update_start <= self.curiter:
+                        POK.pr_update_local(
+                            addr, pr, ob, ex, aux, obn, obn_max, a=self._pr_a, b=self._pr_b)
+
+                    # compute log-likelihood
+                    if self.p.compute_log_likelihood:
+                        PROP.fw(aux, aux)
+                        FUK.log_likelihood2(aux, addr, mag, ma, err_phot)
+
+                data_ex.record_done(self.queue, 'compute')
+                if iblock + len(self.ex_data) < len(self.dID_list):
+                    data_ex.from_gpu(self.qu_dtoh)
+
+            self.dID_list.reverse()  # swap direction
+            self.center_probe()  # Re-center probe
+
+            self.curiter += 1
+            self.ex_data.syncback = False
+
+        self.queue.synchronize()  # finish all the compute
+
+        # cupy arrays have no get_async(); asnumpy copies into s.data on the given stream
+        for name, s in self.ob.S.items():
+            cp.asnumpy(s.gpu, stream=self.qu_dtoh, out=s.data)
+        for name, s in self.pr.S.items():
+            cp.asnumpy(s.gpu, stream=self.qu_dtoh, out=s.data)
+
+        for dID, prep in self.diff_info.items():
+            err_fourier = prep.err_fourier_gpu.get()
+            err_phot = prep.err_phot_gpu.get()
+            err_exit = prep.err_exit_gpu.get()
+            errs = np.ascontiguousarray(
+                np.vstack([err_fourier, err_phot, err_exit]).T)
+            error.update(zip(prep.view_IDs, errs))
+
+        self.qu_dtoh.synchronize()  # wait for the async transfers
+
+        self.error = error
+        return error
+
+    def position_update_local(self, prep, i):
+        """
+        Iterates through the trial shifts for view `i` of `prep` and refines
+        its address by a given algorithm.
+
+        Does nothing unless position refinement is enabled and the current
+        iteration is within [start, stop) and a multiple of the interval.
+        """
+        if not self.do_position_refinement:
+            return
+        pos_ref = self.p.position_refinement
+        in_window = pos_ref.stop > self.curiter >= pos_ref.start
+        on_interval = (self.curiter % pos_ref.interval) == 0
+        if not (in_window & on_interval):
+            return
+
+        # NOTE: there is no wait on the mag/ma upload events in here (ev_mag and
+        # ev_ma are not in scope); ordering the streams is left to the caller
+        kern = self.kernels[prep.label]
+        PCK, PROP, aux = kern.PCK, kern.PROP, kern.aux
+        use_fourier = pos_ref.metric == "fourier"
+        use_photon = pos_ref.metric == "photon"
+
+        # device arrays belonging to view `i`
+        pID, oID, eID = prep.poe_IDs
+        ob = self.ob.S[oID].gpu
+        pr = self.pr.S[pID].gpu
+        mag = prep.mag_full[i, None]
+        ma = prep.ma_full[i, None]
+        ma_sum = prep.ma_sum_gpu[i, None]
+        addr = prep.addr_gpu[i, None]
+        mangled_addr = prep.mangled_addr_gpu[i, None]
+        err_fourier = prep.err_fourier_gpu[i, None]
+        error_state = prep.error_state_gpu[i, None]
+
+        # Keep track of object boundaries
+        max_oby = ob.shape[-2] - aux.shape[-2] - 1
+        max_obx = ob.shape[-1] - aux.shape[-1] - 1
+
+        def _measure(address):
+            # propagate the model at `address` and write its error, in the
+            # configured metric, to err_fourier
+            PCK.build_aux(aux, address, ob, pr)
+            PROP.fw(aux, aux)
+            if use_fourier:
+                PCK.fourier_error(aux, address, mag, ma, ma_sum)
+                PCK.error_reduce(address, err_fourier)
+            if use_photon:
+                PCK.log_likelihood(aux, address, mag, ma, err_fourier)
+
+        def _copy_error(dst, src):
+            # asynchronous copy between the two error arrays on the compute stream
+            cp.cuda.runtime.memcpyAsync(dst=dst.data.ptr,
+                                        src=src.data.ptr,
+                                        size=err_fourier.nbytes,
+                                        stream=self.queue.ptr,
+                                        kind=3)  # d2d
+
+        # We need to re-calculate the current error
+        _measure(addr)
+        _copy_error(error_state, err_fourier)
+
+        PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0])
+        for trial in range(PCK.mangler.nshifts):
+            PCK.mangler.get_address(
+                trial, addr, mangled_addr, max_oby, max_obx)
+            _measure(mangled_addr)
+            PCK.update_addr_and_error_state(
+                addr, error_state, mangled_addr, err_fourier)
+
+        # copy the tracked error state back into the fourier error of the view
+        _copy_error(err_fourier, error_state)
+
+    def center_probe(self):
+        """Shift probe, object and exit waves so that the probe's mass center sits at the array center."""
+        tol = self.p.probe_center_tol
+        if tol is None:
+            return
+        for name, pr_s in self.pr.storages.items():
+            # mass center of abs2sum(probe) against the center of the last two axes
+            com = self.MCK.mass_center(self.A2SK.abs2sum(pr_s.gpu)).get()
+            center = (np.asarray(pr_s.shape[-2:]) // 2).astype(com.dtype)
+            shift = center - com
+            # exit if the current center of mass is within the tolerance
+            if u.norm(shift) < tol:
+                break
+
+            # shift the probe and the object storage of its first view
+            pr_s.gpu = self.ISK.interpolate_shift(pr_s.gpu, shift)
+            ob_s = pr_s.views[0].pod.ob_view.storage
+            ob_s.gpu = self.ISK.interpolate_shift(ob_s.gpu, shift)
+
+            # shift the exit waves of the blocks that use this probe
+            for dID in self.di.S.keys():
+                prep = self.diff_info[dID]
+                pID, _oID, _eID = prep.poe_IDs
+                if pID == name:
+                    prep.ex_full = self.ISK.interpolate_shift(
+                        prep.ex_full, shift)
+
+            log(4, 'Probe recentered from %s to %s'
+                % (str(tuple(com)), str(tuple(center))))
+
+    def engine_finalize(self):
+        """
+        Clear GPU data and copy the results back into regular host arrays.
+        """
+        self.ex_data = None
+        self.ma_data = None
+        self.mag_data = None
+
+        for name, s in self.ob.S.items():
+            del s.gpu
+        for name, s in self.pr.S.items():
+            del s.gpu
+        for dID, prep in self.diff_info.items():
+            prep.addr = prep.addr_gpu.get()
+
+        # copy data to cpu
+        # this kills the pagelock memory (otherwise we get segfaults in h5py)
+        for name, s in self.pr.S.items():
+            s.data = np.copy(s.data)
+        for name, s in self.ob.S.items():
+            s.data = np.copy(s.data)
+
+        # no context to detach: engine_initialize only stores streams on this engine
+        super().engine_finalize()
+
+
+@register()
+class EPIE_cupy(_StochasticEngineCupy, EPIEMixin):
+    """
+    An accelerated implementation of the EPIE algorithm.
+
+    Defaults:
+
+    [name]
+    default = EPIE_cupy
+    type = str
+    help =
+    doc =
+
+    """
+
+    def __init__(self, ptycho_parent, pars=None):
+        _StochasticEngineCupy.__init__(self, ptycho_parent, pars)  # engine base first
+        EPIEMixin.__init__(self, self.p.alpha, self.p.beta)  # mixin is configured from self.p
+        ptycho_parent.citations.add_article(**self.article)  # add self.article to the parent's citations
+
+
+@register()
+class SDR_cupy(_StochasticEngineCupy, SDRMixin):
+    """
+    An accelerated implementation of the semi-implicit relaxed Douglas-Rachford (SDR) algorithm.
+
+    Defaults:
+
+    [name]
+    default = SDR_cupy
+    type = str
+    help =
+    doc =
+
+    """
+
+    def __init__(self, ptycho_parent, pars=None):
+        _StochasticEngineCupy.__init__(self, ptycho_parent, pars)  # engine base first
+        SDRMixin.__init__(self, self.p.sigma, self.p.tau,
+                          self.p.beta_probe, self.p.beta_object)  # mixin is configured from self.p
+        ptycho_parent.citations.add_article(**self.article)  # add self.article to the parent's citations
diff --git a/ptypy/accelerate/cuda_cupy/kernels.py b/ptypy/accelerate/cuda_cupy/kernels.py
new file mode 100644
index 000000000..53c012076
--- /dev/null
+++ b/ptypy/accelerate/cuda_cupy/kernels.py
@@ -0,0 +1,1345 @@
+import numpy as np
+from ..base import kernels as ab
+from . import load_kernel
+import cupy as cp
+from ..base.kernels import Adict
+from inspect import getfullargspec
+from .array_utils import MaxAbs2Kernel, CropPadKernel
+from ptypy.utils.verbose import logger
+
+
+# fourier support
+def choose_fft(arr_shape, fft_type=None):
+    """Pick the FFT class for arrays of `arr_shape` (rows, columns); `fft_type='cupy'` always selects FFT_cupy."""
+    rows, columns = arr_shape[0], arr_shape[1]
+    use_fft_cuda = rows == columns and rows in (16, 32, 64, 128, 256, 512, 1024, 2048)
+    if fft_type == 'cupy':
+        from ptypy.accelerate.cuda_cupy.cufft import FFT_cupy as FFT
+    elif use_fft_cuda:
+        try:
+            from ptypy.accelerate.cuda_cupy.cufft import FFT_cuda as FFT
+        except Exception:  # was a bare except, which also swallowed KeyboardInterrupt/SystemExit
+            logger.info(
+                'Unable to import optimised cufft version - using cufft with separte callbacks instead')
+            from ptypy.accelerate.cuda_cupy.cufft import FFT_cupy as FFT
+    else:
+        logger.info(
+            'cufft: array dimensions are not powers of two (16 to 2048) - using cufft with separated callbacks')
+        from ptypy.accelerate.cuda_cupy.cufft import FFT_cupy as FFT
+    return FFT
+
+
+class PropagationKernel:
+    """Forward (fw) and backward (bw) propagation for 'farfield' or 'nearfield'; both are bound in allocate()."""
+    def __init__(self, aux, propagator, queue_thread=None, fft_type='cuda'):
+        self.aux = aux
+        self._queue = queue_thread
+        self.prop_type = propagator.p.propagation
+        self.fw = None  # set by allocate()
+        self.bw = None  # set by allocate()
+        self._fft1 = None
+        self._fft2 = None
+        self._p = propagator
+        self.fft_type = fft_type
+
+    def allocate(self):
+        """Create the FFT objects and bind self.fw / self.bw according to self.prop_type."""
+        aux = self.aux
+        FFT = choose_fft(aux.shape[-2:], self.fft_type)
+
+        if self.prop_type == 'farfield':
+            # crop/pad is only active when the propagator has a non-zero crop_pad entry
+            self._do_crop_pad = (self._p.crop_pad != 0).any()
+            if self._do_crop_pad:
+                aux_shape = tuple(np.array(aux.shape) +
+                                  np.append([0], self._p.crop_pad))
+                self._tmp = np.zeros(aux_shape, dtype=aux.dtype)
+                self._CPK = CropPadKernel(queue=self._queue)
+            else:
+                self._tmp = aux
+
+            self._fft1 = FFT(self._tmp, self.queue,
+                             pre_fft=self._p.pre_fft,
+                             post_fft=self._p.post_fft,
+                             symmetric=True,
+                             forward=True)
+            self._fft2 = FFT(self._tmp, self.queue,
+                             pre_fft=self._p.pre_ifft,
+                             post_fft=self._p.post_ifft,
+                             symmetric=True,
+                             forward=False)
+            if self._do_crop_pad:
+                self._tmp = cp.asarray(self._tmp)  # host template becomes the device work buffer
+
+            def _fw(x, y):
+                if self._do_crop_pad:
+                    self._CPK.crop_pad_2d_simple(self._tmp, x)
+                    self._fft1.ft(self._tmp, self._tmp)
+                    self._CPK.crop_pad_2d_simple(y, self._tmp)
+                else:
+                    self._fft1.ft(x, y)
+
+            def _bw(x, y):
+                if self._do_crop_pad:
+                    self._CPK.crop_pad_2d_simple(self._tmp, x)
+                    self._fft2.ift(self._tmp, self._tmp)
+                    self._CPK.crop_pad_2d_simple(y, self._tmp)
+                else:
+                    self._fft2.ift(x, y)
+
+            self.fw = _fw
+            self.bw = _bw
+
+        elif self.prop_type == "nearfield":
+            self._fft1 = FFT(aux, self.queue,
+                             post_fft=self._p.kernel,
+                             symmetric=True,
+                             forward=True)
+            self._fft2 = FFT(aux, self.queue,
+                             post_fft=self._p.ikernel,
+                             inplace=True,
+                             symmetric=True,
+                             forward=True)
+            self._fft3 = FFT(aux, self.queue,
+                             symmetric=True,
+                             forward=False)
+
+            def _fw(x, y):
+                self._fft1.ft(x, y)  # forward FFT built with post_fft=kernel
+                self._fft3.ift(y, y)
+
+            def _bw(x, y):
+                self._fft2.ft(x, y)  # forward FFT built with post_fft=ikernel
+                self._fft3.ift(y, y)
+
+            self.fw = _fw
+            self.bw = _bw
+        else:
+            logger.warning(  # only a warning: fw and bw stay None in this case
+                "Unable to select propagator %s, only nearfield and farfield are supported" % self.prop_type)
+
+    @property
+    def queue(self):
+        return self._queue
+
+    @queue.setter
+    def queue(self, queue):
+        self._queue = queue  # the FFT objects below are None until allocate() has run
+        self._fft1.queue = queue
+        self._fft2.queue = queue
+        if self.prop_type == "nearfield":
+            self._fft3.queue = queue
+
+
+class FourierSupportKernel:
+    """Applies a support in Fourier space: forward FFT built with post_fft=support, then inverse FFT."""
+    def __init__(self, support, queue_thread=None):
+        self.support = support
+        self.queue = queue_thread
+
+    def allocate(self):
+        FFT = choose_fft(self.support.shape[-2:])  # FFT class chosen from the last two dimensions
+        self._fft1 = FFT(self.support, self.queue,
+                         post_fft=self.support,
+                         symmetric=True,
+                         forward=True)
+        self._fft2 = FFT(self.support, self.queue,
+                         symmetric=True,
+                         forward=False)
+
+    def apply_fourier_support(self, x):
+        self._fft1.ft(x, x)  # in place: x is both input and output
+        self._fft2.ift(x, x)
+
+
+class RealSupportKernel:
+    def __init__(self, support, queue=None):
+        self.queue = queue  # optional; presumably a cupy stream, see the use() calls below
+        self.support = support
+
+    def allocate(self):
+        if self.queue is not None:
+            self.queue.use()
+        self.support = cp.asarray(self.support)  # replace the support by a cupy array
+
+    def apply_real_support(self, x):
+        if self.queue is not None:
+            self.queue.use()
+        x *= self.support  # in-place multiplication of x by the support
+
+
+class FourierUpdateKernel(ab.FourierUpdateKernel):
+
+    def __init__(self, aux, nmodes=1, queue_thread=None, accumulate_type='float', math_type='float'):
+        """Load the CUDA kernels; accumulate_type and math_type must be 'float' or 'double' (else ValueError)."""
+        super(FourierUpdateKernel, self).__init__(aux,  nmodes=nmodes)
+        if accumulate_type not in ['float', 'double']:
+            raise ValueError('Only float or double types are supported')
+        if math_type not in ['float', 'double']:
+            raise ValueError('Only float or double types are supported')
+        self.accumulate_type = accumulate_type
+        self.math_type = math_type
+        self.queue = queue_thread
+        self.fmag_all_update_cuda = load_kernel("fmag_all_update", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.fmag_update_nopbound_cuda = None  # not loaded here
+        self.fourier_deviation_cuda = None  # not loaded here
+        self.fourier_error_cuda = load_kernel("fourier_error", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.fourier_error2_cuda = None  # not loaded here
+        self.error_reduce_cuda = load_kernel("error_reduce", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'ACC_TYPE': self.accumulate_type,
+            'BDIM_X': 32,
+            'BDIM_Y': 32,
+        })
+        self.fourier_update_cuda = None  # not loaded here
+        self.log_likelihood_cuda, self.log_likelihood2_cuda = load_kernel(
+            ("log_likelihood", "log_likelihood2"), {
+                'IN_TYPE': 'float',
+                'OUT_TYPE': 'float',
+                'MATH_TYPE': self.math_type
+            },
+            "log_likelihood.cu")
+        self.exit_error_cuda = load_kernel("exit_error", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        # device buffers; they are created in allocate()
+        self.gpu = Adict()
+        self.gpu.fdev = None
+        self.gpu.ferr = None
+
+    def allocate(self):  # creates zero-filled float32 buffers of shape self.fshape
+        self.gpu.fdev = cp.zeros(self.fshape, dtype=np.float32)
+        self.gpu.ferr = cp.zeros(self.fshape, dtype=np.float32)
+
+    def fourier_error(self, f, addr, fmag, fmask, mask_sum):  # launches the fourier_error kernel on self.gpu.fdev / self.gpu.ferr
+        fdev = self.gpu.fdev
+        ferr = self.gpu.ferr
+        if self.queue is not None:
+            self.queue.use()
+        if True:  # hard-wired: the else branch below is never executed
+            # version going over all modes in a single thread (faster)
+            self.fourier_error_cuda(grid=(int(fmag.shape[0]), 1, 1),  # one block per entry of fmag's first axis
+                                    block=(32, 32, 1),
+                                    args=(np.int32(self.nmodes),
+                                          f,
+                                          fmask,
+                                          fmag,
+                                          fdev,
+                                          ferr,
+                                          mask_sum,
+                                          addr,
+                                          np.int32(self.fshape[1]),
+                                          np.int32(self.fshape[2])))
+        else:
+            # version using one thread per mode + shared mem reduction (slower)
+            if self.fourier_error2_cuda is None:
+                self.fourier_error2_cuda = load_kernel("fourier_error2")  # note: no substitutions passed here
+            bx = 16
+            by = 16
+            bz = int(self.nmodes)
+            blk = (bx, by, bz)
+            grd = (int((self.fshape[2] + bx-1) // bx),  # ceil-divide the frame over 16x16 tiles
+                   int((self.fshape[1] + by-1) // by),
+                   int(self.fshape[0]))
+            # print('block={}, grid={}, fshape={}'.format(blk, grd, self.fshape))
+            self.fourier_error2_cuda(grid=grd,
+                                     block=blk,
+                                     args=(np.int32(self.nmodes),
+                                           f,
+                                           fmask,
+                                           fmag,
+                                           fdev,
+                                           ferr,
+                                           mask_sum,
+                                           addr,
+                                           np.int32(self.fshape[1]),
+                                           np.int32(self.fshape[2])),
+                                     shared_mem=int(bx*by*bz*4))  # 4 bytes per thread, presumably one float32 each
+
+    def fourier_deviation(self, f, addr, fmag):  # launches the fourier_deviation kernel with self.gpu.fdev
+        fdev = self.gpu.fdev
+        if self.fourier_deviation_cuda is None:  # kernel is loaded on first use and then kept
+            self.fourier_deviation_cuda = load_kernel("fourier_deviation", {
+                'IN_TYPE': 'float',
+                'OUT_TYPE': 'float',
+                'MATH_TYPE': self.math_type
+            })
+        bx = 64
+        by = 1
+        if self.queue is not None:
+            self.queue.use()
+        self.fourier_deviation_cuda(grid=(
+            1, int((self.fshape[2] + by - 1)//by), int(fmag.shape[0])),  # grid y is derived from fshape[2]
+            block=(bx, by, 1),
+            args=(np.int32(self.nmodes),
+                  f,
+                  fmag,
+                  fdev,
+                  addr,
+                  np.int32(self.fshape[1]),
+                  np.int32(self.fshape[2])))
+
+    def error_reduce(self, addr, err_sum):  # addr is accepted but not used in this body
+        if self.queue is not None:
+            self.queue.use()
+        self.error_reduce_cuda(grid=(int(err_sum.shape[0]), 1, 1),  # one block per entry of err_sum
+                               block=(32, 32, 1),
+                               args=(self.gpu.ferr,  # kernel input is the internal ferr buffer
+                               err_sum,
+                               np.int32(self.fshape[1]),
+                               np.int32(self.fshape[2])))
+
+    def fmag_all_update(self, f, addr, fmag, fmask, err_fmag, pbound=0.0):  # launches the fmag_all_update kernel
+        fdev = self.gpu.fdev
+        if self.queue is not None:
+            self.queue.use()
+        self.fmag_all_update_cuda(grid=(int(fmag.shape[0]*self.nmodes), 1, 1),  # one block per (fmag entry, mode) pair
+                                  block=(32, 32, 1),
+                                  args=(f,
+                                  fmask,
+                                  fmag,
+                                  fdev,
+                                  err_fmag,
+                                  addr,
+                                  np.float32(pbound),  # pbound is always passed as float32
+                                  np.int32(self.fshape[1]),
+                                  np.int32(self.fshape[2])))
+
+    def fmag_update_nopbound(self, f, addr, fmag, fmask):  # launches the fmag_update_nopbound kernel
+        fdev = self.gpu.fdev
+        bx = 64
+        by = 1
+        if self.queue is not None:
+            self.queue.use()
+        if self.fmag_update_nopbound_cuda is None:  # kernel is loaded on first use and then kept
+            self.fmag_update_nopbound_cuda = load_kernel("fmag_update_nopbound", {
+                'IN_TYPE': 'float',
+                'OUT_TYPE': 'float',
+                'MATH_TYPE': self.math_type
+            })
+        self.fmag_update_nopbound_cuda(grid=(1,
+                                             int((
+                                                 self.fshape[2] + by - 1) // by),  # grid y is derived from fshape[2]
+                                             int(fmag.shape[0]*self.nmodes)),
+                                       block=(bx, by, 1),
+                                       args=(f,
+                                             fmask,
+                                             fmag,
+                                             fdev,
+                                             addr,
+                                             np.int32(self.fshape[1]),
+                                             np.int32(self.fshape[2])))
+
+    # Note: this was a test to join the kernels, but it's > 2x slower!
+    def fourier_update(self, f, addr, fmag, fmask, mask_sum, err_fmag, pbound=0):
+        if self.fourier_update_cuda is None:  # kernel is loaded on first use, without substitutions
+            self.fourier_update_cuda = load_kernel("fourier_update")
+        if self.queue is not None:
+            self.queue.use()
+
+        fdev = self.gpu.fdev
+        ferr = self.gpu.ferr
+
+        bx = 16
+        by = 16
+        bz = int(self.nmodes)  # block z dimension equals the number of modes
+        blk = (bx, by, bz)
+        grd = (int((self.fshape[2] + bx-1) // bx),  # ceil-divide the frame over 16x16 tiles
+               int((self.fshape[1] + by-1) // by),
+               int(self.fshape[0]))
+        smem = int(bx*by*bz*4)  # 4 bytes per thread, presumably one float32 each
+        self.fourier_update_cuda(grid=grd,
+                                 block=blk,
+                                 args=(np.int32(self.nmodes),
+                                       f,
+                                       fmask,
+                                       fmag,
+                                       fdev,
+                                       ferr,
+                                       mask_sum,
+                                       addr,
+                                       err_fmag,
+                                       np.float32(pbound),
+                                       np.int32(self.fshape[1]),
+                                       np.int32(self.fshape[2])),
+                                 shared_mem=smem)
+
+    def log_likelihood(self, b_aux, addr, mag, mask, err_phot):  # kernel launch followed by error_reduce into err_phot
+        ferr = self.gpu.ferr
+        if self.queue is not None:
+            self.queue.use()
+        self.log_likelihood_cuda(grid=(int(mag.shape[0]), 1, 1),  # one block per entry of mag's first axis
+                                 block=(32, 32, 1),
+                                 args=(np.int32(self.nmodes),
+                                       b_aux,
+                                       mask,
+                                       mag,
+                                       addr,
+                                       ferr,
+                                       np.int32(self.fshape[1]),
+                                       np.int32(self.fshape[2])))
+        # TODO: we might want to move this call outside of here
+        self.error_reduce(addr, err_phot)  # reduces self.gpu.ferr into err_phot
+
+    def log_likelihood2(self, b_aux, addr, mag, mask, err_phot):  # 64x1-block variant of log_likelihood
+        ferr = self.gpu.ferr
+        bx = 64
+        by = 1
+        if self.queue is not None:
+            self.queue.use()
+        self.log_likelihood2_cuda(grid=(
+            1, int((self.fshape[1] + by - 1) // by), int(mag.shape[0])),  # NOTE(review): fshape[1] here, sibling 64x1 launches use fshape[2] - confirm
+            block=(bx, by, 1),
+            args=(np.int32(self.nmodes),
+                  b_aux,
+                  mask,
+                  mag,
+                  addr,
+                  ferr,
+                  np.int32(self.fshape[1]),
+                  np.int32(self.fshape[2])))
+        # TODO: we might want to move this call outside of here
+        self.error_reduce(addr, err_phot)  # reduces self.gpu.ferr into err_phot
+
+    def exit_error(self, aux, addr):  # launches the exit_error kernel with self.gpu.ferr
+        sh = addr.shape
+        maxz = sh[0]  # first axis of addr sets the number of blocks
+        ferr = self.gpu.ferr
+        if self.queue is not None:
+            self.queue.use()
+        self.exit_error_cuda(grid=(int(maxz), 1, 1),
+                             block=(32, 32, 1),
+                             args=(np.int32(self.nmodes),
+                                   aux,
+                                   ferr,
+                                   addr,
+                                   np.int32(self.fshape[1]),
+                                   np.int32(self.fshape[2])))
+
+
+class AuxiliaryWaveKernel(ab.AuxiliaryWaveKernel):
+
+    def __init__(self, queue_thread=None, math_type='float'):
+        super(AuxiliaryWaveKernel, self).__init__()
+        # and now initialise the cuda
+        self.queue = queue_thread
+        self._ob_shape = None  # cache used by _cache_object_shape
+        self._ob_id = None
+        self.math_type = math_type
+        if math_type not in ['float', 'double']:  # checked after the attribute has already been set
+            raise ValueError('Only double or float math is supported')
+        self.make_aux_cuda, self.make_aux2_cuda = load_kernel(
+            ("make_aux", "make_aux2"), {
+                'IN_TYPE': 'float',
+                'OUT_TYPE': 'float',
+                'MATH_TYPE': self.math_type
+            }, "make_aux.cu")
+        self.make_exit_cuda = load_kernel("make_exit", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.build_aux_no_ex_cuda, self.build_aux2_no_ex_cuda = load_kernel(
+            ("build_aux_no_ex", "build_aux2_no_ex"), {
+                'IN_TYPE': 'float',
+                'OUT_TYPE': 'float',
+                'MATH_TYPE': self.math_type
+            }, "build_aux_no_ex.cu")
+        # self.build_exit_alpha_tau_cuda = load_kernel("build_exit_alpha_tau", {
+        #     'IN_TYPE': 'float',
+        #     'OUT_TYPE': 'float',
+        #     'MATH_TYPE': self.math_type
+        # })
+
+    # DEPRECATED? Uses self.npy / self.ocl, which this class never assigns (presumably the parent does - confirm).
+    def load(self, aux, ob, pr, ex, addr):
+        super(AuxiliaryWaveKernel, self).load(aux, ob, pr, ex, addr)
+        for key, array in self.npy.__dict__.items():  # copy every attribute of self.npy into self.ocl
+            self.ocl.__dict__[key] = cp.asarray(array)  # to_gpu is a pycuda name; cupy uploads with asarray
+
+    def make_aux(self, b_aux, addr, ob, pr, ex, c_po=1.0, c_e=0.0):  # launches the make_aux kernel
+        obr, obc = self._cache_object_shape(ob)  # np.int32 sizes of ob's last two axes
+        sh = addr.shape
+        nmodes = sh[1]
+        maxz = sh[0]
+        if self.queue is not None:
+            self.queue.use()
+        self.make_aux_cuda(grid=(int(maxz * nmodes), 1, 1),  # one block per (addr axis 0, addr axis 1) pair
+                           block=(32, 32, 1),
+                           args=(b_aux,
+                                 ex,
+                                 np.int32(ex.shape[1]), np.int32(ex.shape[2]),
+                                 pr,
+                                 np.int32(ex.shape[1]), np.int32(ex.shape[2]),  # pr dimensions are taken from ex, not from pr
+                                 ob,
+                                 obr, obc,
+                                 addr,
+                                 np.float32(
+                                     c_po) if ex.dtype == np.complex64 else np.float64(c_po),  # scalar width follows ex.dtype
+                                 np.float32(c_e) if ex.dtype == np.complex64 else np.float64(c_e)))
+
+    def make_aux2(self, b_aux, addr, ob, pr, ex, c_po=1.0, c_e=0.0):  # 64x1-block variant of make_aux
+        obr, obc = self._cache_object_shape(ob)  # np.int32 sizes of ob's last two axes
+        sh = addr.shape
+        nmodes = sh[1]
+        maxz = sh[0]
+        bx = 64
+        by = 1
+        if self.queue is not None:
+            self.queue.use()
+        self.make_aux2_cuda(grid=(1,
+                                  int((ex.shape[1] + by - 1)//by),  # grid y is derived from ex.shape[1]
+                                  int(maxz * nmodes)),
+                            block=(bx, by, 1),
+                            args=(b_aux,
+                                  ex,
+                                  np.int32(ex.shape[1]), np.int32(ex.shape[2]),
+                                  pr,
+                                  np.int32(ex.shape[1]), np.int32(ex.shape[2]),  # pr dimensions are taken from ex, not from pr
+                                  ob,
+                                  obr, obc,
+                                  addr,
+                                  np.float32(
+                                      c_po) if ex.dtype == np.complex64 else np.float64(c_po),  # scalar width follows ex.dtype
+                                  np.float32(
+                                      c_e) if ex.dtype == np.complex64 else np.float64(c_e)))
+
+    def make_exit(self, b_aux, addr, ob, pr, ex, c_a=1.0, c_po=0.0, c_e=-1.0):  # launches the make_exit kernel
+        obr, obc = self._cache_object_shape(ob)  # np.int32 sizes of ob's last two axes
+        sh = addr.shape
+        nmodes = sh[1]
+        maxz = sh[0]
+        if self.queue is not None:
+            self.queue.use()
+        self.make_exit_cuda(grid=(int(maxz * nmodes), 1, 1),  # one block per (addr axis 0, addr axis 1) pair
+                            block=(32, 32, 1),
+                            args=(b_aux,
+                                  ex,
+                                  np.int32(ex.shape[1]), np.int32(ex.shape[2]),
+                                  pr,
+                                  np.int32(ex.shape[1]), np.int32(ex.shape[2]),  # pr dimensions are taken from ex, not from pr
+                                  ob,
+                                  obr, obc,
+                                  addr,
+                                  np.float32(
+                                      c_a) if ex.dtype == np.complex64 else np.float64(c_a),  # scalar width follows ex.dtype
+                                  np.float32(
+                                      c_po) if ex.dtype == np.complex64 else np.float64(c_po),
+                                  np.float32(
+                                      c_e) if ex.dtype == np.complex64 else np.float64(c_e)))
+
+    def build_aux2(self, b_aux, addr, ob, pr, ex, alpha=1.0):  # wrapper: make_aux2 with c_po = 1 + alpha, c_e = -alpha
+        # DM only, legacy. also make_aux2 does no exit in the parent
+        self.make_aux2(b_aux, addr, ob, pr, ex, 1.+alpha, -alpha)
+
+    """
+    def build_exit_alpha_tau(self, b_aux, addr, ob, pr, ex, alpha=1, tau=1):
+        obr, obc = self._cache_object_shape(ob)
+        sh = addr.shape
+        nmodes = sh[1]
+        maxz = sh[0]
+        bx = 64
+        by = 1
+        if self.queue is not None:
+            self.queue.use()
+        self.build_exit_alpha_tau_cuda(grid=(1, int((ex.shape[1] + by - 1) // by), int(maxz * nmodes)),
+                                       block=(bx, by, 1),
+                                       args=(b_aux,
+                                       ex,
+                                       np.int32(ex.shape[1]), np.int32(ex.shape[2]),
+                                       pr,
+                                       np.int32(ex.shape[1]), np.int32(ex.shape[2]),
+                                       ob,
+                                       obr, obc,
+                                       addr,
+                                       np.float32(alpha), np.float32(tau)))
+    """
+
+    def build_aux_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False):  # launches the build_aux_no_ex kernel
+        obr, obc = self._cache_object_shape(ob)  # np.int32 sizes of ob's last two axes
+        sh = addr.shape
+        nmodes = sh[1]
+        maxz = sh[0]
+        if self.queue is not None:
+            self.queue.use()
+        self.build_aux_no_ex_cuda(grid=(int(maxz * nmodes), 1, 1),  # one block per (addr axis 0, addr axis 1) pair
+                                  block=(32, 32, 1),
+                                  args=(b_aux,
+                                        np.int32(b_aux.shape[-2]),
+                                        np.int32(b_aux.shape[-1]),
+                                        pr,
+                                        np.int32(pr.shape[-2]),
+                                        np.int32(pr.shape[-1]),
+                                        ob,
+                                        obr, obc,
+                                        addr,
+                                        np.float32(
+                                            fac) if pr.dtype == np.complex64 else np.float64(fac),  # scalar width follows pr.dtype
+                                        np.int32(add)))  # the add flag is passed as an int32
+
+    def build_aux2_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False):  # 64x1-block variant of build_aux_no_ex
+        obr, obc = self._cache_object_shape(ob)  # np.int32 sizes of ob's last two axes
+        sh = addr.shape
+        nmodes = sh[1]
+        maxz = sh[0]
+        bx = 64
+        by = 1
+        if self.queue is not None:
+            self.queue.use()
+        self.build_aux2_no_ex_cuda(grid=(1, int((b_aux.shape[-2] + by - 1)//by), int(maxz * nmodes)),
+                                   block=(bx, by, 1),
+                                   args=(b_aux,
+                                         np.int32(b_aux.shape[-2]),
+                                         np.int32(b_aux.shape[-1]),
+                                         pr,
+                                         np.int32(pr.shape[-2]),
+                                         np.int32(pr.shape[-1]),
+                                         ob,
+                                         obr, obc,
+                                         addr,
+                                         np.float32(
+                                             fac) if pr.dtype == np.complex64 else np.float64(fac),  # scalar width follows pr.dtype
+                                         np.int32(add)))  # the add flag is passed as an int32
+
+    def _cache_object_shape(self, ob):
+        # Return the last two axis sizes of ob as an (np.int32, np.int32) tuple, cached between calls.
+        # Key on id AND shape: id() alone can be reused by a new array once the old one is collected.
+        key = (id(ob), tuple(ob.shape[-2:]))
+        if key != self._ob_id:
+            self._ob_id = key
+            self._ob_shape = (np.int32(ob.shape[-2]), np.int32(ob.shape[-1]))
+        return self._ob_shape
+
+
+class GradientDescentKernel(ab.GradientDescentKernel):
+
+    def __init__(self, aux, nmodes=1, queue=None, accumulate_type='double', math_type='float'):
+        super().__init__(aux, nmodes)
+        self.queue = queue
+        self.accumulate_type = accumulate_type
+        self.math_type = math_type
+        if (accumulate_type not in ['double', 'float']) or (math_type not in ['double', 'float']):
+            raise ValueError(
+                "accumulate and math types must be double or float")
+
+        self.gpu = Adict()
+        self.gpu.LLden = None  # these three buffers are created in allocate()
+        self.gpu.LLerr = None
+        self.gpu.Imodel = None
+
+        subs = {  # substitutions shared by every kernel loaded below
+            'IN_TYPE': 'float' if self.ftype == np.float32 else 'double',
+            'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double',
+            'ACC_TYPE': self.accumulate_type,
+            'MATH_TYPE': self.math_type
+        }
+        self.make_model_cuda = load_kernel('make_model', subs)
+        self.make_a012_cuda = load_kernel('make_a012', subs)
+        self.error_reduce_cuda = load_kernel('error_reduce', {
+            **subs,
+            'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double',  # same value as in subs
+            'BDIM_X': 32,
+            'BDIM_Y': 32
+        })
+        self.fill_b_cuda, self.fill_b_reduce_cuda = load_kernel(
+            ('fill_b', 'fill_b_reduce'),
+            {
+                **subs,
+                'BDIM_X': 1024,
+                'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double'  # same value as in subs
+            },
+            file="fill_b.cu")
+        self.main_cuda = load_kernel('gd_main', subs)
+        self.floating_intensity_cuda_step1, self.floating_intensity_cuda_step2 = \
+            load_kernel(('step1', 'step2'), subs, 'intens_renorm.cu')
+
+    def allocate(self):  # creates the work buffers; fic_tmp and Btmp first appear here, not in __init__
+        self.gpu.LLden = cp.zeros(self.fshape, dtype=self.ftype)
+        self.gpu.LLerr = cp.zeros(self.fshape, dtype=self.ftype)
+        self.gpu.Imodel = cp.zeros(self.fshape, dtype=self.ftype)
+        tmp = np.ones((self.fshape[0],), dtype=self.ftype)  # one value per entry of fshape[0], initialised to 1
+        self.gpu.fic_tmp = cp.asarray(tmp)
+
+        # temporary array for the reduction in fill_b
+        sh = (3, int((np.prod(self.fshape)*self.nmodes + 1023) // 1024))  # 3 rows, one column per 1024-element block
+        self.gpu.Btmp = cp.zeros(
+            sh, dtype=np.float64 if self.accumulate_type == 'double' else np.float32)
+
+    def make_model(self, b_aux, addr):  # launches make_model with b_aux and self.gpu.Imodel; addr is not used here
+        # reference shape
+        sh = self.fshape
+
+        # batch buffers
+        Imodel = self.gpu.Imodel
+        aux = b_aux
+
+        # dimensions / grid
+        z = np.int32(sh[0])
+        y = np.int32(self.nmodes)
+        x = np.int32(sh[1] * sh[2])  # flattened size of one frame
+        bx = 1024
+        if self.queue is not None:
+            self.queue.use()
+        self.make_model_cuda(grid=(int((x + bx - 1) // bx), 1, int(z)),  # ceil(x / 1024) blocks for each of the z frames
+                             block=(bx, 1, 1),
+                             args=(aux, Imodel, z, y, x))
+
+    def make_a012(self, b_f, b_a, b_b, addr, I, fic):  # launches make_a012; addr is not used in this body
+        # reference shape (= GPU global dims)
+        sh = I.shape
+
+        # stopper
+        maxz = I.shape[0]
+
+        A0 = self.gpu.Imodel  # the three internal buffers are passed as A0, A1, A2
+        A1 = self.gpu.LLerr
+        A2 = self.gpu.LLden
+
+        z = np.int32(sh[0])
+        maxz = np.int32(maxz)  # same value as z, since both come from I.shape[0]
+        y = np.int32(self.nmodes)
+        x = np.int32(sh[1]*sh[2])  # flattened size of one frame
+        bx = 1024
+        if self.queue is not None:
+            self.queue.use()
+        self.make_a012_cuda(grid=(int((x + bx - 1) // bx), 1, int(z)),  # ceil(x / 1024) blocks for each of the z frames
+                            block=(bx, 1, 1),
+                            args=(b_f, b_a, b_b, I, fic,
+                            A0, A1, A2, z, y, x, maxz))
+
+    def fill_b(self, addr, Brenorm, w, B):  # fill_b kernel into Btmp, then fill_b_reduce from Btmp into B
+        # stopper
+        maxz = w.shape[0]  # not used further in this body
+
+        A0 = self.gpu.Imodel
+        A1 = self.gpu.LLerr
+        A2 = self.gpu.LLden
+
+        sz = np.int32(np.prod(w.shape))  # total number of elements in w
+        blks = int((sz + 1023) // 1024)  # ceil(sz / 1024) blocks of 1024 threads
+        # print('blocks={}, Btmp={}, fshape={}, wshape={}, modes={}'.format(blks, self.gpu.Btmp.shape, self.fshape, w.shape, self.nmodes))
+        assert self.gpu.Btmp.shape[1] >= blks  # Btmp needs one column per block; note asserts vanish under -O
+        # 2-stage reduction - even if 1 block, as we have a += in second kernel
+        if self.queue is not None:
+            self.queue.use()
+        self.fill_b_cuda(grid=(blks, 1, 1),
+                         block=(1024, 1, 1),
+                         args=(A0, A1, A2, w,
+                         np.float32(Brenorm) if self.ftype == np.float32 else np.float64(  # scalar width follows self.ftype
+                             Brenorm),
+                         sz, self.gpu.Btmp))
+        self.fill_b_reduce_cuda(grid=(1, 1, 1),  # second stage runs as a single block
+                                block=(1024, 1, 1),
+                                args=(self.gpu.Btmp, B, np.int32(blks)))
+
+    def error_reduce(self, addr, err_sum):  # reduces self.gpu.LLerr into err_sum; addr is not used here
+        # reference shape  (= GPU global dims)
+        sh = err_sum.shape  # not used further in this body
+
+        # stopper
+        maxz = err_sum.shape[0]
+
+        # batch buffers
+        ferr = self.gpu.LLerr
+
+        # print('maxz={}, ferr={}'.format(maxz, ferr.shape))
+        assert (maxz <= np.prod(ferr.shape[:-2]))  # err_sum must not ask for more frames than LLerr holds
+
+        if self.queue is not None:
+            self.queue.use()
+
+        # Reduces the LL error along the last 2 dimensions.
+        self.error_reduce_cuda(grid=(int(maxz), 1, 1),  # one block per entry of err_sum
+                               block=(32, 32, 1),
+                               args=(ferr, err_sum,
+                                     np.int32(ferr.shape[-2]),
+                                     np.int32(ferr.shape[-1])))
+
+    def floating_intensity(self, addr, w, I, fic):  # four launches: step1, two error_reduce calls, step2
+
+        # reference shape  (= GPU global dims)
+        sh = I.shape
+
+        # stopper
+        maxz = I.shape[0]
+
+        # internal buffers
+        num = self.gpu.LLerr  # LLerr and LLden are reused here under the names num and den
+        den = self.gpu.LLden
+        Imodel = self.gpu.Imodel
+        fic_tmp = self.gpu.fic_tmp
+
+        ## math ##
+        xall = np.int32(maxz * sh[1] * sh[2])  # total element count of I
+        bx = 1024
+
+        if self.queue is not None:
+            self.queue.use()
+
+        self.floating_intensity_cuda_step1(grid=(int((xall + bx - 1) // bx), 1, 1),  # ceil(xall / 1024) blocks
+                                           block=(bx, 1, 1),
+                                           args=(Imodel, I, w, num, den,
+                                                 xall))
+
+        self.error_reduce_cuda(grid=(int(maxz), 1, 1),  # reduces num into fic
+                               block=(32, 32, 1),
+                               args=(num, fic,
+                                     np.int32(num.shape[-2]),
+                                     np.int32(num.shape[-1])))
+
+        self.error_reduce_cuda(grid=(int(maxz), 1, 1),  # reduces den into fic_tmp
+                               block=(32, 32, 1),
+                               args=(den, fic_tmp,
+                                     np.int32(den.shape[-2]),
+                                     np.int32(den.shape[-1])))
+
+        self.floating_intensity_cuda_step2(grid=(1, 1, int(maxz)),  # one block per frame along grid z
+                                           block=(32, 32, 1),
+                                           args=(fic_tmp, fic, Imodel,
+                                                 np.int32(Imodel.shape[-2]),
+                                                 np.int32(Imodel.shape[-1])))
+
+    def main(self, b_aux, addr, w, I):  # launches the gd_main kernel; addr is not used in this body
+        nmodes = self.nmodes
+        # stopper
+        maxz = I.shape[0]
+
+        # batch buffers
+        err = self.gpu.LLerr
+        Imodel = self.gpu.Imodel
+        aux = b_aux
+
+        # write-to shape  (= GPU global dims)
+        ish = aux.shape
+
+        x = np.int32(ish[1] * ish[2])  # flattened frame size, taken from aux rather than from I
+        y = np.int32(nmodes)
+        z = np.int32(maxz)
+        bx = 1024
+
+        if self.queue is not None:
+            self.queue.use()
+
+        # print(Imodel.dtype, I.dtype, w.dtype, err.dtype, aux.dtype, z, y, x)
+        self.main_cuda(grid=(int((x + bx - 1) // bx), 1, int(z)),  # ceil(x / 1024) blocks for each of the z frames
+                       block=(bx, 1, 1),
+                       args=(Imodel, I, w, err, aux,
+                             z, y, x))
+
+
+class PoUpdateKernel(ab.PoUpdateKernel):
+
+    def __init__(self, queue_thread=None,
+                 math_type='float', accumulator_type='float'):
+        super(PoUpdateKernel, self).__init__()
+        # and now initialise the cuda
+        if math_type not in ['double', 'float']:  # raises ValueError for any other string
+            raise ValueError(
+                'only float and double are supported for math_type')
+        if accumulator_type not in ['double', 'float']:
+            raise ValueError(
+                'only float and double are supported for accumulator_type')
+        self.math_type = math_type
+        self.accumulator_type = accumulator_type
+        self.queue = queue_thread
+        self.norm = None
+        self.MAK = MaxAbs2Kernel(self.queue)  # helper kernel object built on the same queue
+        self.ob_update_cuda = load_kernel("ob_update", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.ob_update2_cuda = None  # load_kernel("ob_update2")
+        self.pr_update_cuda = load_kernel("pr_update", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.pr_update2_cuda = None  # left unset here, like the other *2 kernels
+        self.ob_update_ML_cuda = load_kernel("ob_update_ML", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.ob_update2_ML_cuda = None
+        self.pr_update_ML_cuda = load_kernel("pr_update_ML", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.pr_update2_ML_cuda = None
+        self.ob_update_local_cuda = load_kernel("ob_update_local", {  # the *_local kernels also receive ACC_TYPE
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type,
+            'ACC_TYPE': self.accumulator_type
+        })
+        self.pr_update_local_cuda = load_kernel("pr_update_local", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type,
+            'ACC_TYPE': self.accumulator_type
+        })
+        self.ob_norm_local_cuda = load_kernel("ob_norm_local", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type,
+            'ACC_TYPE': self.accumulator_type
+        })
+        self.pr_norm_local_cuda = load_kernel("pr_norm_local", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type,
+            'ACC_TYPE': self.accumulator_type
+        })
+
+    def ob_update(self, addr, ob, obn, pr, ex, atomics=True):  # launches ob_update (atomics) or ob_update2 (tiled)
+        obsh = [np.int32(ax) for ax in ob.shape]
+        prsh = [np.int32(ax) for ax in pr.shape]
+        if obn.dtype != np.float32:  # raises ValueError for any other denominator dtype
+            raise ValueError(
+                "Denominator must be float32 in current implementation")
+
+        if self.queue is not None:
+            self.queue.use()
+        if atomics:
+            if addr.shape[3] != 3 or addr.shape[2] != 5:  # atomics path needs addr axes 2 and 3 to be 5 and 3
+                raise ValueError(
+                    'Address not in required shape for atomics ob_update')
+            num_pods = np.int32(addr.shape[0] * addr.shape[1])
+            self.ob_update_cuda(grid=(int(num_pods), 1, 1),  # one block per pod
+                                block=(32, 32, 1),
+                                args=(ex, num_pods, prsh[1], prsh[2],
+                                      pr, prsh[0], prsh[1], prsh[2],
+                                      ob, obsh[0], obsh[1], obsh[2],
+                                      addr,
+                                      obn))
+        else:
+            if addr.shape[0] != 5 or addr.shape[1] != 3:  # tiled path needs addr axes 0 and 1 to be 5 and 3
+                raise ValueError(
+                    'Address not in required shape for tiled ob_update')
+            num_pods = np.int32(addr.shape[2] * addr.shape[3])
+            if not self.ob_update2_cuda:  # NOTE(review): loaded once, so NUM_MODES stays at the first call's obsh[0]
+                self.ob_update2_cuda = load_kernel("ob_update2", {
+                    "NUM_MODES": obsh[0],
+                    "BDIM_X": 16,
+                    "BDIM_Y": 16,
+                    'IN_TYPE': 'float',
+                    'OUT_TYPE': 'float',
+                    'MATH_TYPE': self.math_type,
+                    'ACC_TYPE': self.accumulator_type
+                })
+
+            grid = [int((x+15)//16) for x in ob.shape[-2:]]  # ceil-divide ob's last two axes over 16x16 tiles
+            grid = (grid[1], grid[0], int(1))  # grid x follows the last axis, grid y the second to last
+            self.ob_update2_cuda(grid=grid,
+                                 block=(16, 16, 1),
+                                 args=(prsh[-1], obsh[0], num_pods, obsh[-2], obsh[-1],
+                                       prsh[0],
+                                       np.int32(ex.shape[0]),
+                                       np.int32(ex.shape[1]),
+                                       np.int32(ex.shape[2]),
+                                       ob, obn, pr, ex, addr))
+
+    def pr_update(self, addr, pr, prn, ob, ex, atomics=True):
+        """Launch the `pr_update` kernel, in its atomics or tiled variant (see `atomics`).
+
+        Raises ValueError if `prn` is not float32 or `addr` does not fit the chosen variant.
+        """
+        ob_dims = [np.int32(n) for n in ob.shape]
+        pr_dims = [np.int32(n) for n in pr.shape]
+        if prn.dtype != np.float32:
+            raise ValueError("Denominator must be float32 in current implementation")
+        if self.queue is not None:
+            self.queue.use()
+
+        if atomics:
+            # atomics variant: a grid of num_pods blocks with 32x32 threads each
+            if addr.shape[3] != 3 or addr.shape[2] != 5:
+                raise ValueError('Address not in required shape for atomics pr_update')
+            num_pods = np.int32(addr.shape[0] * addr.shape[1])
+            kernel_args = (ex, num_pods, pr_dims[1], pr_dims[2],
+                           pr, pr_dims[0], pr_dims[1], pr_dims[2],
+                           ob, ob_dims[0], ob_dims[1], ob_dims[2],
+                           addr, prn)
+            self.pr_update_cuda(grid=(int(num_pods), 1, 1), block=(32, 32, 1), args=kernel_args)
+            return
+
+        # tiled variant: 16x16 thread tiles covering the last two axes of `pr`
+        if addr.shape[0] != 5 or addr.shape[1] != 3:
+            raise ValueError('Address not in required shape for tiled pr_update')
+        num_pods = np.int32(addr.shape[2] * addr.shape[3])
+        # the tiled kernel is loaded on first use, with NUM_MODES taken from `pr`
+        if not self.pr_update2_cuda:
+            self.pr_update2_cuda = load_kernel("pr_update2", {
+                "NUM_MODES": pr_dims[0],
+                "BDIM_X": 16,
+                "BDIM_Y": 16,
+                'IN_TYPE': 'float',
+                'OUT_TYPE': 'float',
+                'MATH_TYPE': self.math_type,
+                'ACC_TYPE': self.accumulator_type
+            })
+        tiles = [int((extent + 15)//16) for extent in pr.shape[-2:]]
+        self.pr_update2_cuda(grid=(tiles[0], tiles[1], int(1)),
+                             block=(16, 16, 1),
+                             args=(pr_dims[-1], ob_dims[-2], ob_dims[-1],
+                                   pr_dims[0], ob_dims[0], num_pods,
+                                   pr, prn, ob, ex, addr))
+
+    def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True):
+        """Launch `ob_update_ML_cuda` (atomics) or `ob_update2_ML_cuda` (tiled), see `atomics`.
+
+        `fac` is handed to the kernel as float32 if `ex` is complex64, and as float64 otherwise.
+        Raises ValueError if `addr` does not fit the chosen variant.
+        """
+        ob_dims = [np.int32(n) for n in ob.shape]
+        pr_dims = [np.int32(n) for n in pr.shape]
+        ex_dims = [np.int32(n) for n in ex.shape]
+        if self.queue is not None:
+            self.queue.use()
+        scale = np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac)
+
+        if atomics:
+            # atomics variant: a grid of num_pods blocks with 32x32 threads each
+            if addr.shape[3] != 3 or addr.shape[2] != 5:
+                raise ValueError('Address not in required shape for atomics ob_update')
+            num_pods = np.int32(addr.shape[0] * addr.shape[1])
+            kernel_args = (ex, num_pods, ex_dims[1], ex_dims[2],
+                           pr, pr_dims[0], pr_dims[1], pr_dims[2],
+                           ob, ob_dims[0], ob_dims[1], ob_dims[2],
+                           addr, scale)
+            self.ob_update_ML_cuda(grid=(int(num_pods), 1, 1), block=(32, 32, 1), args=kernel_args)
+            return
+
+        # tiled variant: 16x16 thread tiles covering the last two axes of `ob`
+        if addr.shape[0] != 5 or addr.shape[1] != 3:
+            raise ValueError('Address not in required shape for tiled ob_update')
+        num_pods = np.int32(addr.shape[2] * addr.shape[3])
+        if not self.ob_update2_ML_cuda:
+            self.ob_update2_ML_cuda = load_kernel("ob_update2_ML", {
+                "NUM_MODES": ob_dims[0],
+                "BDIM_X": 16,
+                "BDIM_Y": 16,
+                'IN_TYPE': 'float',
+                'OUT_TYPE': 'float',
+                'MATH_TYPE': self.math_type,
+                'ACC_TYPE': self.accumulator_type
+            })
+        tiles = [int((extent + 15)//16) for extent in ob.shape[-2:]]
+        self.ob_update2_ML_cuda(grid=(tiles[1], tiles[0], int(1)),
+                                block=(16, 16, 1),
+                                args=(pr_dims[-1], ob_dims[0], num_pods, ob_dims[-2], ob_dims[-1],
+                                      pr_dims[0],
+                                      ex_dims[0],
+                                      ex_dims[1],
+                                      ex_dims[2],
+                                      ob, pr, ex, addr,
+                                      scale))
+
+    def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False):
+        """Launch `pr_update_ML_cuda` (atomics) or `pr_update2_ML_cuda` (tiled), see `atomics`.
+
+        `fac` is handed to the kernel as float32 if `ex` is complex64, and as float64 otherwise.
+        Raises ValueError if `addr` does not fit the chosen variant.
+        """
+        ob_dims = [np.int32(n) for n in ob.shape]
+        pr_dims = [np.int32(n) for n in pr.shape]
+        if self.queue is not None:
+            self.queue.use()
+        scale = np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac)
+
+        if atomics:
+            if addr.shape[3] != 3 or addr.shape[2] != 5:
+                raise ValueError('Address not in required shape for atomics pr_update')
+            num_pods = np.int32(addr.shape[0] * addr.shape[1])
+            kernel_args = (ex, num_pods, pr_dims[1], pr_dims[2],
+                           pr, pr_dims[0], pr_dims[1], pr_dims[2],
+                           ob, ob_dims[0], ob_dims[1], ob_dims[2],
+                           addr, scale)
+            self.pr_update_ML_cuda(grid=(int(num_pods), 1, 1), block=(32, 32, 1), args=kernel_args)
+            return
+
+        # tiled variant: 16x16 thread tiles covering the last two axes of `pr`
+        if addr.shape[0] != 5 or addr.shape[1] != 3:
+            raise ValueError('Address not in required shape for tiled pr_update')
+        num_pods = np.int32(addr.shape[2] * addr.shape[3])
+        if not self.pr_update2_ML_cuda:
+            self.pr_update2_ML_cuda = load_kernel("pr_update2_ML", {
+                "NUM_MODES": pr_dims[0],
+                "BDIM_X": 16,
+                "BDIM_Y": 16,
+                'IN_TYPE': 'float',
+                'OUT_TYPE': 'float',
+                'MATH_TYPE': self.math_type,
+                'ACC_TYPE': self.accumulator_type
+            })
+        tiles = [int((extent + 15)//16) for extent in pr.shape[-2:]]
+        self.pr_update2_ML_cuda(grid=(tiles[0], tiles[1], int(1)),
+                                block=(16, 16, 1),
+                                args=(pr_dims[-1], ob_dims[-2], ob_dims[-1],
+                                      pr_dims[0], ob_dims[0], num_pods,
+                                      pr, ob, ex, addr, scale))
+
+    def ob_update_local(self, addr, ob, pr, ex, aux, prn, a=0., b=1.):
+        """Launch the `ob_update_local` kernel (atomics address layout only).
+
+        The maximum of `prn` is computed with cupy and handed to the kernel.
+        Raises ValueError if `addr` does not have the atomics layout.
+        """
+        # atomics version only - checked first, before anything is queued on the GPU
+        if addr.shape[3] != 3 or addr.shape[2] != 5:
+            raise ValueError('Address not in required shape for atomics ob_update')
+        if self.queue is not None:
+            self.queue.use()
+        prn_max = cp.max(prn)
+        obsh = [np.int32(ax) for ax in ob.shape]
+        prsh = [np.int32(ax) for ax in pr.shape]
+        exsh = [np.int32(ax) for ax in ex.shape]
+        num_pods = np.int32(addr.shape[0] * addr.shape[1])
+        bx = 64
+        by = 1
+        grid = (1, int((exsh[1] + by - 1)//by), int(num_pods))
+        self.ob_update_local_cuda(grid=grid,
+                                  block=(bx, by, 1),
+                                  args=(ex, aux,
+                                        exsh[0], exsh[1], exsh[2],
+                                        pr, prsh[0], prsh[1], prsh[2],
+                                        prn,
+                                        ob, obsh[0], obsh[1], obsh[2],
+                                        addr, prn_max,
+                                        np.float32(a), np.float32(b)))
+
+    def pr_update_local(self, addr, pr, ob, ex, aux, obn, obn_max, a=0., b=1.):
+        """Launch the `pr_update_local` kernel (atomics address layout only).
+
+        `obn_max` is handed to the kernel as given by the caller.
+        Raises ValueError if `addr` does not have the atomics layout.
+        """
+        obsh = [np.int32(ax) for ax in ob.shape]
+        prsh = [np.int32(ax) for ax in pr.shape]
+        exsh = [np.int32(ax) for ax in ex.shape]
+        # atomics version only
+        if addr.shape[3] != 3 or addr.shape[2] != 5:
+            raise ValueError('Address not in required shape for atomics pr_update')
+        if self.queue is not None:
+            self.queue.use()
+        num_pods = np.int32(addr.shape[0] * addr.shape[1])
+        bx = 64
+        by = 1
+        grid = (1, int((exsh[1] + by - 1) // by), int(num_pods))
+        self.pr_update_local_cuda(grid=grid,
+                                  block=(bx, by, 1),
+                                  args=(ex, aux,
+                                        exsh[0], exsh[1], exsh[2],
+                                        pr, prsh[0], prsh[1], prsh[2],
+                                        obn,
+                                        ob, obsh[0], obsh[1], obsh[2],
+                                        addr, obn_max,
+                                        np.float32(a), np.float32(b)))
+
+    def ob_norm_local(self, addr, ob, obn):
+        """Launch the `ob_norm_local` kernel with `obn`, `ob`, their shapes and `addr`."""
+        ob_dims = [np.int32(n) for n in ob.shape]
+        obn_dims = [np.int32(n) for n in obn.shape]
+        threads_x, threads_y = 64, 1
+        if self.queue is not None:
+            self.queue.use()
+        # grid: 1 x ceil(obn.shape[1] / threads_y) x obn.shape[0] blocks
+        grid = (1, int((obn_dims[1] + threads_y - 1)//threads_y), int(obn_dims[0]))
+        self.ob_norm_local_cuda(
+            grid=grid,
+            block=(threads_x, threads_y, 1),
+            args=(obn, obn_dims[0], obn_dims[1], obn_dims[2],
+                  ob, ob_dims[0], ob_dims[1], ob_dims[2],
+                  addr))
+
+    def pr_norm_local(self, addr, pr, prn):
+        """Launch the `pr_norm_local` kernel with `prn`, `pr`, their shapes and `addr`."""
+        pr_dims = [np.int32(n) for n in pr.shape]
+        prn_dims = [np.int32(n) for n in prn.shape]
+        threads_x, threads_y = 64, 1
+        if self.queue is not None:
+            self.queue.use()
+        # grid: 1 x ceil(prn.shape[1] / threads_y) x prn.shape[0] blocks
+        grid = (1, int((prn_dims[1] + threads_y - 1)//threads_y), int(prn_dims[0]))
+        self.pr_norm_local_cuda(
+            grid=grid,
+            block=(threads_x, threads_y, 1),
+            args=(prn, prn_dims[0], prn_dims[1], prn_dims[2],
+                  pr, pr_dims[0], pr_dims[1], pr_dims[2],
+                  addr))
+
+
+class PositionCorrectionKernel(ab.PositionCorrectionKernel):
+    """Position correction kernels launched on the GPU through cupy."""
+    from ptypy.accelerate.cuda_cupy import address_manglers
+
+    # these are used by the self.setup method - replacing them with the GPU implementation
+    MANGLERS = {
+        'Annealing': address_manglers.RandomIntMangler,
+        'GridSearch': address_manglers.GridSearchMangler
+    }
+
+    def __init__(self, *args, queue_thread=None, math_type='float', accumulate_type='float', **kwargs):
+        """Load the CUDA kernels; `queue_thread`, if given, is activated with `.use()` before launches."""
+        super(PositionCorrectionKernel, self).__init__(*args, **kwargs)
+        # make sure we set the right stream in the mangler
+        self.mangler.queue = queue_thread
+        if math_type not in ['float', 'double']:
+            raise ValueError('Only float or double math is supported')
+        if accumulate_type not in ['float', 'double']:
+            raise ValueError('Only float or double accumulation is supported')
+
+        self.math_type = math_type
+        self.accumulate_type = accumulate_type
+        self.queue = queue_thread
+        self._ob_shape = None
+        self._ob_id = None
+        # add kernels
+        self.fourier_error_cuda = load_kernel("fourier_error", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.error_reduce_cuda = load_kernel("error_reduce", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'BDIM_X': 32,
+            'BDIM_Y': 32,
+            'ACC_TYPE': self.accumulate_type
+        })
+        self.log_likelihood_cuda, self.log_likelihood_ml_cuda = load_kernel(
+            ("log_likelihood", "log_likelihood_ml"), {
+                'IN_TYPE': 'float',
+                'OUT_TYPE': 'float',
+                'MATH_TYPE': self.math_type
+            }, "log_likelihood.cu")
+        self.build_aux_pc_cuda = load_kernel("build_aux_position_correction", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type
+        })
+        self.update_addr_and_error_state_cuda = load_kernel("update_addr_error_state", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float'
+        })
+
+        self.gpu = Adict()
+        self.gpu.fdev = None
+        self.gpu.ferr = None
+
+    def allocate(self):
+        """Allocate the float32 device buffers `gpu.fdev` and `gpu.ferr` with shape `self.fshape`."""
+        self.gpu.fdev = cp.zeros(self.fshape, dtype=np.float32)
+        self.gpu.ferr = cp.zeros(self.fshape, dtype=np.float32)
+
+    def build_aux(self, b_aux, addr, ob, pr):
+        """Launch `build_aux_position_correction` on `addr.shape[0] * addr.shape[1]` blocks."""
+        obr, obc = self._cache_object_shape(ob)
+        maxz, nmodes = addr.shape[0], addr.shape[1]
+        if self.queue is not None:
+            self.queue.use()
+        self.build_aux_pc_cuda(grid=(int(maxz * nmodes), 1, 1),
+                               block=(32, 32, 1),
+                               args=(b_aux,
+                                     pr,
+                                     np.int32(pr.shape[1]), np.int32(pr.shape[2]),
+                                     ob,
+                                     obr, obc,
+                                     addr))
+
+    def fourier_error(self, f, addr, fmag, fmask, mask_sum):
+        """Launch the `fourier_error` kernel, one block per entry of the first axis of `fmag`."""
+        fdev = self.gpu.fdev
+        ferr = self.gpu.ferr
+        if self.queue is not None:
+            self.queue.use()
+        self.fourier_error_cuda(grid=(int(fmag.shape[0]), 1, 1),
+                                block=(32, 32, 1),
+                                args=(np.int32(self.nmodes),
+                                      f,
+                                      fmask,
+                                      fmag,
+                                      fdev,
+                                      ferr,
+                                      mask_sum,
+                                      addr,
+                                      np.int32(self.fshape[1]),
+                                      np.int32(self.fshape[2])))
+
+    def error_reduce(self, addr, err_fmag):
+        """Launch the `error_reduce` kernel with `gpu.ferr` and `err_fmag`; `addr` is not used."""
+        if self.queue is not None:
+            self.queue.use()
+        self.error_reduce_cuda(grid=(int(err_fmag.shape[0]), 1, 1),
+                               block=(32, 32, 1),
+                               args=(self.gpu.ferr,
+                                     err_fmag,
+                                     np.int32(self.fshape[1]),
+                                     np.int32(self.fshape[2])))
+
+    def log_likelihood(self, b_aux, addr, mag, mask, err_phot):
+        """Launch the `log_likelihood` kernel, then `error_reduce` with `err_phot`."""
+        ferr = self.gpu.ferr
+        if self.queue is not None:
+            self.queue.use()
+        self.log_likelihood_cuda(grid=(int(mag.shape[0]), 1, 1),
+                                 block=(32, 32, 1),
+                                 args=(np.int32(self.nmodes),
+                                       b_aux,
+                                       mask,
+                                       mag,
+                                       addr,
+                                       ferr,
+                                       np.int32(self.fshape[1]),
+                                       np.int32(self.fshape[2])))
+        # TODO: we might want to move this call outside of here
+        self.error_reduce(addr, err_phot)
+
+    def log_likelihood_ml(self, b_aux, addr, I, weights, err_phot):
+        """Launch the `log_likelihood_ml` kernel, then `error_reduce` with `err_phot`."""
+        ferr = self.gpu.ferr
+        if self.queue is not None:
+            self.queue.use()
+        self.log_likelihood_ml_cuda(grid=(int(I.shape[0]), 1, 1),
+                                    block=(32, 32, 1),
+                                    args=(np.int32(self.nmodes),
+                                          b_aux,
+                                          weights,
+                                          I,
+                                          addr,
+                                          ferr,
+                                          np.int32(self.fshape[1]),
+                                          np.int32(self.fshape[2])))
+        # TODO: we might want to move this call outside of here
+        self.error_reduce(addr, err_phot)
+
+    def update_addr_and_error_state(self, addr, error_state, mangled_addr, err_sum):
+        """Launch the `update_addr_error_state` kernel on the given address and error arrays."""
+        # assume all data is on GPU!
+        if self.queue is not None:
+            self.queue.use()
+        self.update_addr_and_error_state_cuda(grid=(
+            1, int((err_sum.shape[0] + 1) // 2), 1),
+            block=(32, 2, 1),
+            args=(addr, mangled_addr, error_state, err_sum,
+                  np.int32(addr.shape[1])))
+
+    def _cache_object_shape(self, ob):
+        """Return the last two dimensions of `ob` as np.int32, also kept in `self._ob_shape`."""
+        # recomputed on every call: an id can be reused by a new array once the old one is freed
+        self._ob_id = id(ob)
+        self._ob_shape = (np.int32(ob.shape[-2]), np.int32(ob.shape[-1]))
+        return self._ob_shape
diff --git a/ptypy/accelerate/cuda_cupy/mem_utils.py b/ptypy/accelerate/cuda_cupy/mem_utils.py
new file mode 100644
index 000000000..a92a9657b
--- /dev/null
+++ b/ptypy/accelerate/cuda_cupy/mem_utils.py
@@ -0,0 +1,319 @@
+import numpy as np
+import cupy as cp
+import cupyx
+from collections import deque
+
+
+def make_pagelocked_paired_arrays(ar):
+    pinned = cupyx.empty_pinned(ar.shape, ar.dtype, order="C")  # C-ordered pinned host buffer
+    pinned[:] = ar  # fill the pinned buffer with the contents of `ar`
+    return cp.asarray(pinned), pinned  # (array on the GPU, its pinned host counterpart)
+
+
+class GpuData:
+    """
+    Manages one block of GPU data with corresponding CPU data. Keeps track of
+    which cpu array is currently on GPU by its id, and transfers if it's not
+    already there. To be used for the exit wave, ma, and mag arrays.
+    Note: Allocator should be pooled for best performance
+    """
+
+    def __init__(self, nbytes, syncback=False):
+        """
+        New instance of GpuData. Allocates the GPU-side array.
+
+        :param nbytes: Number of bytes held by this instance.
+        :param syncback: Should the data be synced back to CPU any time it's swapped out
+        """
+
+        self.gpu = None  # cupy array created by the last to_gpu call
+        self.gpuraw = cp.cuda.alloc(nbytes)  # raw device allocation handed out by _allocator
+        self.nbytes = nbytes  # largest request that _allocator accepts
+        self.nbytes_buffer = nbytes  # size that gpuraw was allocated with
+        self.gpuId = None  # id of the data currently on the GPU
+        self.cpu = None  # host array the GPU data was copied from
+        self.syncback = syncback
+        self.ev_done = None  # last event recorded for this block, if any
+
+    def _allocator(self, nbytes):
+        """Allocator callback that returns the pre-allocated buffer if nbytes fits"""
+        if nbytes > self.nbytes:
+            raise Exception('requested more bytes than maximum given before: {} vs {}'.format(
+                nbytes, self.nbytes))
+        return self.gpuraw
+
+    def record_done(self, stream):
+        """Record a new event on stream and keep it in ev_done"""
+        self.ev_done = cp.cuda.Event()
+        with stream:
+            self.ev_done.record()
+
+    def to_gpu(self, cpu, id, stream):
+        """
+        Transfer cpu array to GPU on stream (async), keeping track of its id; skipped if id is already held
+        """
+        if self.gpuId != id:
+            if self.syncback:
+                self.from_gpu(stream)
+            self.gpuId = id
+            self.cpu = cpu
+            if self.ev_done is not None:
+                self.ev_done.synchronize()
+            alloc = cp.cuda.get_allocator()
+            try:
+                cp.cuda.set_allocator(self._allocator)  # temporarily install our own allocator
+                with stream:
+                    self.gpu = cp.asarray(cpu)
+            finally:
+                cp.cuda.set_allocator(alloc)  # always restore the previous allocator
+        return self.gpu
+
+    def from_gpu(self, stream):
+        """
+        Transfer data back to CPU, into same data handle it was copied from
+        before.
+        """
+        if self.cpu is not None and self.gpuId is not None and self.gpu is not None:
+            if self.ev_done is not None:
+                stream.wait_event(self.ev_done)
+            cp.cuda.runtime.memcpyAsync(dst=self.cpu.ctypes.data,
+                                        src=self.gpu.data.ptr,
+                                        size=self.gpu.nbytes,
+                                        kind=2,  # d2h
+                                        stream=stream.ptr)
+            self.ev_done = cp.cuda.Event()
+            self.ev_done.record(stream)
+
+    def resize(self, nbytes):
+        """
+        Resize the size of the underlying buffer, to allow re-use in different contexts.
+        Note that memory will only be freed/reallocated if the new number of bytes are
+        either larger than before, or if they are less than 90% of the original size -
+        otherwise it reuses the existing buffer
+        """
+        if nbytes > self.nbytes_buffer or nbytes < self.nbytes_buffer * .9:
+            self.nbytes_buffer = nbytes
+            self.gpuraw.mem.free()
+            self.gpuraw = cp.cuda.alloc(nbytes)
+
+        self.nbytes = nbytes
+        self.reset()
+
+    def reset(self):
+        """
+        Resets handles of cpu references and ids, so that all data will be transferred
+        again even if IDs match.
+        """
+        self.gpuId = None
+        self.cpu = None
+        self.ev_done = None
+
+    def free(self):
+        """
+        Free the underlying buffer on GPU - this object should not be used afterwards
+        """
+        self.gpuraw.mem.free()
+        self.gpuraw = None
+
+
+class GpuData2(GpuData):
+    """
+    Variant of GpuData that also remembers, in `done_what`, what the last
+    recorded event stands for ('dtoh', 'htod' or 'compute').
+    Here `to_gpu` returns the event together with the GPU array, and
+    `from_gpu` returns the event and marks the block for reuse.
+    Note: Allocator should be pooled for best performance
+    """
+
+    def __init__(self, nbytes, syncback=False):
+        """
+        New instance of GpuData2. Allocates the GPU-side array.
+
+        :param nbytes: Number of bytes held by this instance.
+        :param syncback: Should the data be synced back to CPU any time it's swapped out
+        """
+        self.done_what = None
+        super().__init__(nbytes, syncback)
+
+    def record_done(self, stream, what):
+        """Record a new event on stream and note in done_what what it stands for"""
+        assert what in ['dtoh', 'htod', 'compute']
+        self.ev_done = cp.cuda.Event()
+        with stream:
+            self.ev_done.record()
+        self.done_what = what
+
+    def to_gpu(self, cpu, ident, stream):
+        """
+        Transfer cpu array to GPU on stream (async), keeping track of its id (id(cpu) if ident is None)
+        """
+        ident = id(cpu) if ident is None else ident
+        if self.gpuId != ident:
+            if self.ev_done is not None:
+                stream.wait_event(self.ev_done)
+            # Safety measure. This is asynchronous, but it should still work
+            # Essentially we want to copy the data held in gpu array back to its CPU
+            # handle before the buffer can be reused.
+            if self.done_what != 'dtoh' and self.syncback:
+                # uploads on the download stream, easy to spot in nsight-sys
+                self.from_gpu(stream)
+            self.gpuId = ident
+            self.cpu = cpu
+            alloc = cp.cuda.get_allocator()
+            try:
+                cp.cuda.set_allocator(self._allocator)  # temporarily install our own allocator
+                with stream:
+                    self.gpu = cp.asarray(cpu)
+            finally:
+                cp.cuda.set_allocator(alloc)  # always restore the previous allocator
+            self.record_done(stream, 'htod')
+        return self.ev_done, self.gpu
+
+    def from_gpu(self, stream):
+        """
+        Transfer data back to CPU, into same data handle it was copied from
+        before.
+        """
+        if self.cpu is not None and self.gpuId is not None and self.gpu is not None:
+            # Wait for any action recorded with this array
+            if self.ev_done is not None:
+                stream.wait_event(self.ev_done)
+            cp.cuda.runtime.memcpyAsync(dst=self.cpu.ctypes.data,
+                                        src=self.gpu.data.ptr,
+                                        size=self.gpu.nbytes,
+                                        kind=2,  # d2h
+                                        stream=stream.ptr)
+            self.record_done(stream, 'dtoh')
+            # Mark for reuse
+            self.gpuId = None
+            return self.ev_done
+        else:
+            return None
+
+
+class GpuDataManager:
+    """
+    Manages a set of GpuData instances, to keep several blocks on device.
+
+    Currently all blocks must be the same size.
+
+    Note that the syncback property is used so that during fourier updates,
+    the exit wave array is synced back to cpu (it is updated),
+    while during probe update, it's not.
+    """
+
+    def __init__(self, nbytes, num, max=None, syncback=False):
+        """
+        Create an instance of GpuDataManager.
+        Parameters are the same as for GpuData, and num is the number of
+        GpuData instances to create (blocks on device). If max is given,
+        add_data_block does not grow the manager beyond max blocks.
+        """
+        self._syncback = syncback
+        self._nbytes = nbytes
+        self.data = []
+        self.max = max
+        for _ in range(num):
+            self.add_data_block()
+
+    def add_data_block(self, nbytes=None):
+        """
+        Add a GpuData block, unless the maximum number of blocks is reached.
+
+        Parameters
+        ----------
+        nbytes - Size of block, defaults to the block size of this manager
+
+        Returns
+        -------
+        None
+        """
+        if self.max is None or len(self) < self.max:
+            nbytes = nbytes if nbytes is not None else self._nbytes
+            self.data.append(GpuData2(nbytes, self._syncback))
+
+    @property
+    def syncback(self):
+        """
+        Get if syncback of data to CPU on swapout is enabled.
+        """
+        return self._syncback
+
+    @syncback.setter
+    def syncback(self, whether):
+        """
+        Adjust the syncback setting
+        """
+        self._syncback = whether
+        for d in self.data:
+            d.syncback = whether
+
+    @property
+    def nbytes(self):
+        """
+        Get the number of bytes in each block (the configured size while there are no blocks)
+        """
+        return self.data[0].nbytes if self.data else self._nbytes
+
+    @property
+    def memory(self):
+        """
+        Get all memory occupied by all blocks
+        """
+        return sum(d.nbytes_buffer for d in self.data)
+
+    def __len__(self):
+        return len(self.data)
+
+    def reset(self, nbytes, num):
+        """
+        Reset this object as if these parameters were given to the constructor.
+        The syncback property is untouched.
+        """
+        sync = self.syncback
+        # blocks added later through add_data_block must use the new size as well
+        self._nbytes = nbytes
+        # remove if too many, explicitly freeing memory
+        for i in range(num, len(self.data)):
+            self.data[i].free()
+        # cut short if too many
+        self.data = self.data[:num]
+        # reset existing
+        for d in self.data:
+            d.resize(nbytes)
+        # append new ones
+        for _ in range(len(self.data), num):
+            self.data.append(GpuData2(nbytes, sync))
+
+    def free(self):
+        """
+        Explicitly clear all data blocks; the configured block size is kept
+        """
+        for d in self.data:
+            d.free()
+        self.data = []
+
+    def to_gpu(self, cpu, id, stream, pop_id="none"):
+        """
+        Transfer a block to the GPU, given its ID and CPU data array.
+        Uses the block holding id or pop_id, else the one at the front of the list.
+        """
+        idx = 0
+        for i, x in enumerate(self.data):
+            if x.gpuId == id or x.gpuId == pop_id:
+                idx = i
+                break
+        # move the chosen block to the back of the list
+        m = self.data.pop(idx)
+        self.data.append(m)
+        ev, gpu = m.to_gpu(cpu, id, stream)
+        # return the wait event, the gpu array and the block (to register a finished computation)
+        return ev, gpu, m
+
+    def sync_to_cpu(self, stream):
+        """
+        Sync back all data to CPU
+        """
+        for x in self.data:
+            x.from_gpu(stream)
+
diff --git a/ptypy/accelerate/cuda_cupy/multi_gpu.py b/ptypy/accelerate/cuda_cupy/multi_gpu.py
new file mode 100644
index 000000000..79f511423
--- /dev/null
+++ b/ptypy/accelerate/cuda_cupy/multi_gpu.py
@@ -0,0 +1,151 @@
+"""
+Multi-GPU AllReduce Wrapper, that uses NCCL via cupy if it's available,
+and otherwise falls back to CUDA-aware MPI,
+and if that doesn't work, uses host/device copies with regular MPI.
+
+Findings:
+
+1) OpenMPI with CUDA support needs to be available, and:
+  - mpi4py needs to be compiled from master (3.1.0a - latest stable release 3.0.x doesn't have it)
+  - OpenMPI in a conda install needs to have the environment variable OMPI_MCA_opal_cuda_support=true set
+  --> if cuda support isn't enabled, the application simply crashes with a seg fault
+
+2) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used. 
+   It should be in DEFAULT mode.
+
+"""
+
+from pkg_resources import parse_version
+import numpy as np
+import cupy as cp
+from ptypy.utils import parallel
+from ptypy.utils.verbose import logger, log
+import os
+from cupy.cuda import nccl
+
+try:
+    import mpi4py
+except ImportError:
+    mpi4py = None
+
+# properties to check which versions are available
+
+# NCCL is the default choice, unless the user overrides the selection
+# with one of these environment variables
+have_nccl = ('PTYPY_USE_CUDAMPI' not in os.environ and
+             'PTYPY_USE_MPI' not in os.environ)
+
+# At the moment, CUDA-aware MPI requires:
+# the OpenMPI env var OMPI_MCA_opal_cuda_support to be set to true,
+# mpi4py >= 3.1.0 (compared on the base version of the installed release),
+# and not setting the PTYPY_USE_MPI environment variable
+# -> we ideally want to allow enabling support from a parameter in ptypy
+have_cuda_mpi = (
+    mpi4py is not None
+    and os.environ.get("OMPI_MCA_opal_cuda_support") == "true"
+    and parse_version(parse_version(mpi4py.__version__).base_version) >= parse_version("3.1.0")
+    and 'PTYPY_USE_MPI' not in os.environ
+)
+
+
+class MultiGpuCommunicatorBase:
+    """Common parts of the multi-GPU communicators: rank bookkeeping and the input check"""
+
+    def __init__(self):
+        """Take the rank and the number of devices from the `parallel` module"""
+        self.rank, self.ndev = parallel.rank, parallel.size
+
+    def allReduceSum(self, arr):
+        """Check that `arr` is a cupy array; the reduction itself is left to subclasses"""
+        # base class only checks properties of arrays
+        assert isinstance(arr, cp.ndarray), "Input must be a GPU Array"
+
+
+class MultiGpuCommunicatorMpi(MultiGpuCommunicatorBase):
+    """AllReduce communicator that goes through host memory: D2H copy, MPI allreduce, H2D copy"""
+
+    def allReduceSum(self, arr):
+        """All-reduce the GPU array `arr` in-place via a host copy; does nothing if MPI is disabled"""
+        super().allReduceSum(arr)
+        if not parallel.MPIenabled:
+            return
+        # note: this creates a temporary CPU array
+        host = arr.get()
+        parallel.allreduce(host)
+        arr.set(host)
+
+
+class MultiGpuCommunicatorCudaMpi(MultiGpuCommunicatorBase):
+
+    def allReduceSum(self, arr):
+        """All-reduce the GPU array `arr` in-place, handing it to CUDA-aware MPI directly"""
+        super().allReduceSum(arr)  # input check shared with MultiGpuCommunicatorMpi
+        if parallel.MPIenabled:
+            cp.cuda.get_current_stream().synchronize()  # MPI does not wait for queued GPU work
+            parallel.comm.Allreduce(parallel.MPI.IN_PLACE, arr)
+
+
+class MultiGpuCommunicatorNccl(MultiGpuCommunicatorBase):
+    """Communicator for AllReduce that uses NCCL through cupy, with one device per MPI process"""
+
+    def __init__(self):
+        """Create the NCCL communicator; raises RuntimeError unless the compute mode is default"""
+        super().__init__()
+
+        # Check if GPUs are in default mode
+        if cp.cuda.Device().attributes["ComputeMode"] != 0:   ## ComputeModeDefault
+            raise RuntimeError("Compute mode must be default in order to use NCCL")
+
+        # get a unique identifier for the NCCL communicator and
+        # broadcast it to all MPI processes (assuming one device per process)
+        # only rank 0 creates the identifier, the other ranks receive it
+        unique_id = nccl.get_unique_id() if self.rank == 0 else None
+        self.id = parallel.bcast(unique_id)
+        self.com = nccl.NcclCommunicator(self.ndev, self.id, self.rank)
+
+    def allReduceSum(self, arr):
+        """Sum `arr` over all devices in-place with NCCL, on the current cupy stream"""
+        super().allReduceSum(arr)  # input check shared with MultiGpuCommunicatorMpi
+        count, datatype = self.__get_NCCL_count_dtype(arr)
+        self.com.allReduce(arr.data.ptr, arr.data.ptr, count, datatype, nccl.NCCL_SUM,
+                           cp.cuda.get_current_stream().ptr)
+
+    def __get_NCCL_count_dtype(self, arr):
+        """Return the element count and the NCCL data type to use for `arr`
+
+        Complex arrays are passed as twice as many real values of matching precision.
+        """
+        if arr.dtype == np.complex64:
+            return arr.size*2, nccl.NCCL_FLOAT32
+        elif arr.dtype == np.complex128:
+            return arr.size*2, nccl.NCCL_FLOAT64
+        elif arr.dtype == np.float32:
+            return arr.size, nccl.NCCL_FLOAT32
+        elif arr.dtype == np.float64:
+            return arr.size, nccl.NCCL_FLOAT64
+        else:
+            raise ValueError("This dtype is not supported by NCCL.")
+
+
+# pick the appropriate communicator depending on installed packages
+def get_multi_gpu_communicator(use_nccl=True, use_cuda_mpi=True):
+    """Return a NCCL, CUDA-aware MPI or plain MPI communicator, tried in that order"""
+    # a backend that is disabled, or that fails to initialise, is skipped silently
+    if have_nccl and use_nccl:
+        try:
+            nccl_comm = MultiGpuCommunicatorNccl()
+            log(4, "Using NCCL communicator")
+            return nccl_comm
+        except (RuntimeError, AttributeError):  # AttributeError: see issue #323
+            pass
+    if have_cuda_mpi and use_cuda_mpi:
+        try:
+            cuda_mpi_comm = MultiGpuCommunicatorCudaMpi()
+            log(4, "Using CUDA-aware MPI communicator")
+            return cuda_mpi_comm
+        except RuntimeError:
+            pass
+
+    mpi_comm = MultiGpuCommunicatorMpi()
+    log(4, "Using MPI communicator")
+    return mpi_comm
diff --git a/ptypy/accelerate/cuda_cupy/porting_notes.md b/ptypy/accelerate/cuda_cupy/porting_notes.md
new file mode 100644
index 000000000..a10869492
--- /dev/null
+++ b/ptypy/accelerate/cuda_cupy/porting_notes.md
@@ -0,0 +1,60 @@
+# PyCUDA to CuPy Porting Notes
+
+This file collects notes for things to consider and issues that were fixed when 
+porting the pycuda code to cupy.
+
+## Simple Conversions
+
+- `gpuarray.to_gpu` => `cp.asarray`
+- `gpuarray.zeros`, etc, typically have cupy equivalents in `cp.`
+- `gpuarray.get` works the same way on cupy arrays (`x.get()`), but cupy also has the more descriptive `cp.asnumpy`
+- all functions that don't have a direct numpy equivalent are in `cupyx` rather than `cupy`
+  (for example for pinned arrays)
+- raw data pointers to GPU arrays can be retrieved with `x.data.ptr`
+- raw data pointers to streams: `stream.ptr`
+- low-level APIs are closer to the standard CUDA runtime calls and live in the `cupy.cuda.runtime` module, for example `memcpyAsync`
+- streams are not parameters, but rather contexts:
+
+```python
+stream = cp.cuda.Stream()
+with stream:
+  ... # kernel calls etc will go onto this stream
+
+# alternative:
+stream.use()
+... # next kernel calls will use that stream
+```
+
+
+## Sticky Points
+
+### Memory Pool
+
+- cupy uses a device memory pool by default, which re-uses freed memory blocks
+- the pool is empty at the start and new allocations use the regular `cudaMalloc` functions
+- once blocks are freed, they are not given back to the device with cudaFree, but are rather
+  kept in a free list and re-used in further allocations
+- therefore the flag for using a device memory pool, which some engines had, makes no sense with cupy
+- this also affects how the total available memory should be calculated - it is in fact the free
+  device memory + the free memory in the pool
+
+### Page-locked Memory Pool
+
+- cupy also uses a `PinnedMemoryPool` for obtaining page-locked blocks
+- these will be kept in a free list when they are not required anymore
+- it works similarly to the `DeviceMemoryPool`
+
+### Context Management
+
+- cupy does not offer explicit creation or deletion of the context
+- everything runs in the CUDA runtime's default context (created on first use by default)
+- no functions are available to pop the context (as in PyCuda), so need to be
+  careful with cleanup
+
+
+### Kernel Compilation
+
+- cupy uses NVRTC, which is slightly different to NVCC.
+- the generated device code is not exactly the same for some reason
+- Kernels might therefore perform slightly differently - faster or slower, but tests showed
+  that they are largely equivalent in performance
diff --git a/ptypy/accelerate/cuda_pycuda/__init__.py b/ptypy/accelerate/cuda_pycuda/__init__.py
index e6c51d49f..3ce6a7a6e 100644
--- a/ptypy/accelerate/cuda_pycuda/__init__.py
+++ b/ptypy/accelerate/cuda_pycuda/__init__.py
@@ -2,9 +2,8 @@
 from pycuda.compiler import SourceModule
 import numpy as np
 import os
-# debug_options = []
-# debug_options = ['-O0', '-G', '-g']
-debug_options = ['-O3', '-DNDEBUG', '-lineinfo'] # release mode flags
+kernel_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'cuda_common'))
+debug_options = ['-O3', '-DNDEBUG', '-lineinfo', '-I' + kernel_dir] # release mode flags
 
 # C++14 support was added with CUDA 9, so we only enable the flag there
 if cuda.get_version()[0] >= 9:
@@ -46,11 +45,11 @@ def load_kernel(name, subs={}, file=None):
 
     if file is None:
         if isinstance(name, str):
-            fn = "%s/cuda/%s.cu" % (os.path.dirname(__file__), name)
+            fn = "%s/%s.cu" % (kernel_dir, name)
         else:
             raise ValueError("name parameter must be a string if not filename is given")
     else:
-        fn = "%s/cuda/%s" % (os.path.dirname(__file__), file)
+        fn = "%s/%s" % (kernel_dir, file)
 
     with open(fn, 'r') as f:
         kernel = f.read()
diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py
index 7c2de8f3f..2abd02ba4 100644
--- a/ptypy/accelerate/cuda_pycuda/array_utils.py
+++ b/ptypy/accelerate/cuda_pycuda/array_utils.py
@@ -1,26 +1,11 @@
+from ptypy.accelerate.cuda_common.utils import map2ctype
+
 from . import load_kernel
 from pycuda import gpuarray
 import pycuda.driver as cuda
 from ptypy.utils import gaussian
 import numpy as np
 
-# maps a numpy dtype to the corresponding C type
-def map2ctype(dt):
-    if dt == np.float32:
-        return 'float'
-    elif dt == np.float64:
-        return 'double'
-    elif dt == np.complex64:
-        return 'complex<float>'
-    elif dt == np.complex128:
-        return 'complex<double>'
-    elif dt == np.int32:
-        return 'int'
-    elif dt == np.int64:
-        return 'long long'
-    else:
-        raise ValueError('No mapping for {}'.format(dt))
-
 
 class ArrayUtilsKernel:
     def __init__(self, acc_dtype=np.float64, queue=None):
diff --git a/pyproject.toml b/pyproject.toml
index dcfadf665..635431745 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -68,4 +68,4 @@ ptypy = "ptypy"
 
 [tool.setuptools.package-data]
 ptypy = ["resources/*",]
-"ptypy.accelerate.cuda_pycuda.cuda" = ["*.cu"]
\ No newline at end of file
+"ptypy.accelerate.cuda_common" = ["*.cu", "*.cuh"]
\ No newline at end of file
diff --git a/templates/accelerate/ptypy_minimal_prep_and_run_cupy.py b/templates/accelerate/ptypy_minimal_prep_and_run_cupy.py
new file mode 100644
index 000000000..14c862fb1
--- /dev/null
+++ b/templates/accelerate/ptypy_minimal_prep_and_run_cupy.py
@@ -0,0 +1,54 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cupy")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 200
+
+# set home path
+p.io = u.Param()
+p.io.home = "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'DM_cupy'
+p.engines.engine00.numiter = 80
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/test/accelerate_tests/cuda_cupy_tests/__init__.py b/test/accelerate_tests/cuda_cupy_tests/__init__.py
new file mode 100644
index 000000000..7df79ac0f
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/__init__.py
@@ -0,0 +1,33 @@
+import unittest
+import numpy as np
+import importlib
+
+# shall we run performance tests?
+perfrun = False
+
+def have_cupy():
+    """Return True only if cupy imports and CUDA device 0 can be queried."""
+    try:
+        import cupy as cp
+        cp.cuda.Device(0).compute_capability
+    except Exception:
+        # must not reference `cp` here: it is unbound when the import itself fails
+        return False
+    return True
+
+if have_cupy():
+    import cupy as cp
+
+@unittest.skipIf(not have_cupy(), "no cupy available")
+class CupyCudaTest(unittest.TestCase):
+    """Base test case: full-array printing and a dedicated stream per test."""
+    def setUp(self):
+        import sys
+        self._printopts = np.get_printoptions()  # restored in tearDown
+        np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf)
+        self.stream = cp.cuda.Stream()
+        self.stream.use()
+
+    def tearDown(self):
+        np.set_printoptions(**getattr(self, '_printopts', {}))  # bare call is a no-op
+        cp.cuda.Stream.null.use()  # back to default stream
\ No newline at end of file
diff --git a/test/accelerate_tests/cuda_cupy_tests/address_manglers_test.py b/test/accelerate_tests/cuda_cupy_tests/address_manglers_test.py
new file mode 100644
index 000000000..c59fb852d
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/address_manglers_test.py
@@ -0,0 +1,77 @@
+import unittest
+import numpy as np
+from . import perfrun, CupyCudaTest, have_cupy
+from ptypy.accelerate.base import address_manglers as am
+
+if have_cupy():
+    import cupy as cp
+    from ptypy.accelerate.cuda_cupy import address_manglers as gam
+
+
+COMPLEX_TYPE = np.complex64
+FLOAT_TYPE = np.float32
+INT_TYPE = np.int32
+
+class AddressManglersTest(CupyCudaTest):
+
+    def prepare_addresses(self, max_bound=10, scan_pts=2, num_modes=3):
+        total_number_scan_positions = scan_pts ** 2
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions)) + max_bound  # max bound is added in the DM_serial engine.
+        Y = Y.reshape((total_number_scan_positions)) + max_bound
+
+        addr_original = np.zeros((total_number_scan_positions, num_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):  #
+            mode_idx = 0
+            for pr_mode in range(num_modes):
+                for ob_mode in range(1):
+                    addr_original[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+        
+        return addr_original
+
+    def test_get_address_REGRESSION(self):
+        # the other manglers are using the BaseMangler's get_address function
+        # so we set the deltas in a BaseMangler object and test get_address
+
+        scan_pts=2
+        addr_original = self.prepare_addresses(scan_pts=scan_pts)
+        addr_original_dev = cp.asarray(addr_original)
+        nshifts=1
+        step_size=2
+        mglr = gam.BaseMangler(step_size, 50, 100, nshifts, max_bound=2)
+        # 2 shifts, with positive/negative shifting
+        mglr.delta = np.array([
+            [1, 2], 
+            [-4, -2]
+        ], dtype=np.int32)
+        mglr._setup_delta_gpu()
+        
+        addr1 = addr_original_dev.copy()
+        mglr.get_address(0, addr_original_dev, addr1, 10, 9)
+        
+        addr2 = addr_original_dev.copy()
+        mglr.get_address(1, addr_original_dev, addr2, 10, 9)
+
+        exp1 = np.copy(addr_original)
+        exp2 = np.copy(addr_original)
+        # element-wise here to prepare reference
+        for f in range(addr_original.shape[0]):
+            for m in range(addr_original.shape[1]):
+                exp1[f, m, 1, 1] = max(0, min(10, addr_original[f, m, 1, 1] + 1))
+                exp1[f, m, 1, 2] = max(0, min(9, addr_original[f, m, 1, 2] + 2))
+                exp2[f, m, 1, 1] = max(0, min(10, addr_original[f, m, 1, 1] - 4))
+                exp2[f, m, 1, 2] = max(0, min(9, addr_original[f, m, 1, 2] - 2))
+
+        np.testing.assert_array_equal(addr2.get(), exp2)
+        np.testing.assert_array_equal(addr1.get(), exp1)
+        
diff --git a/test/accelerate_tests/cuda_cupy_tests/array_utils_test.py b/test/accelerate_tests/cuda_cupy_tests/array_utils_test.py
new file mode 100644
index 000000000..0c018b205
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/array_utils_test.py
@@ -0,0 +1,536 @@
+import unittest
+import numpy as np
+from . import perfrun, CupyCudaTest, have_cupy
+from ptypy.accelerate.base import array_utils as au
+
+if have_cupy():
+    import cupy as cp
+    import ptypy.accelerate.cuda_cupy.array_utils as gau
+
+
+class ArrayUtilsTest(CupyCudaTest):
+
+    def test_dot_float_float(self):
+        # Arrange
+        X, Y, Z = np.indices((3, 3, 1001), dtype=np.float32)
+        A = 10 ** Y
+        A_dev = cp.asarray(A)
+
+        # Act
+        AU = gau.ArrayUtilsKernel(acc_dtype=np.float32)
+        out_dev = AU.dot(A_dev, A_dev)
+        out = cp.asnumpy(out_dev)
+
+        # Assert
+        np.testing.assert_allclose(out, 30333303.0, rtol=1e-7)
+
+    def test_dot_float_double(self):
+        # Arrange
+        X, Y, Z = np.indices((3, 3, 1001), dtype=np.float32)
+        A = 10 ** Y
+        A_dev = cp.asarray(A)
+
+        # Act
+        AU = gau.ArrayUtilsKernel(acc_dtype=np.float64)
+        out_dev = AU.dot(A_dev, A_dev)
+        out = cp.asnumpy(out_dev)
+
+        # Assert
+        np.testing.assert_equal(out, 30333303.0)
+
+    def test_dot_complex_float(self):
+        # Arrange
+        X, Y, Z = np.indices((3, 3, 1001), dtype=np.float32)
+        A = 10 ** Y + 1j * 10 ** X
+        A = A.astype(np.complex64)
+        A_dev = cp.asarray(A)
+
+        # Act
+        AU = gau.ArrayUtilsKernel(acc_dtype=np.float32)
+        out_dev = AU.dot(A_dev, A_dev)
+        out = cp.asnumpy(out_dev)
+
+        # Assert
+        np.testing.assert_allclose(out, 60666606.0, rtol=1e-7)
+
+    def test_dot_complex_double(self):
+        # Arrange
+        X, Y, Z = np.indices((3, 3, 1001), dtype=np.float32)
+        A = 10 ** Y + 1j * 10 ** X
+        A_dev = cp.asarray(A)
+
+        # Act
+        AU = gau.ArrayUtilsKernel(acc_dtype=np.float64)
+        out_dev = AU.dot(A_dev, A_dev)
+        out = cp.asnumpy(out_dev)
+
+        # Assert
+        np.testing.assert_array_equal(out, 60666606.0)
+
+    @unittest.skipIf(not perfrun, "Performance test")
+    def test_dot_performance(self):
+        # Arrange
+        X, Y, Z = np.indices((3, 3, 1021301), dtype=np.float32)
+        A = 10 ** Y + 1j * 10 ** X
+        A_dev = cp.asarray(A)
+
+        # Act
+        AU = gau.ArrayUtilsKernel(acc_dtype=np.float64)
+        AU.dot(A_dev, A_dev)
+
+    def test_transpose_2D(self):
+        # Arrange
+        inp, _ = np.indices((5, 3), dtype=np.int32)
+        inp_dev = cp.asarray(inp)
+        out_dev = cp.empty((3, 5), dtype=np.int32)
+
+        # Act
+        AU = gau.TransposeKernel()
+        AU.transpose(inp_dev, out_dev)
+
+        # Assert
+        out_exp = np.transpose(inp, (1, 0))
+        out = cp.asnumpy(out_dev)
+        np.testing.assert_array_equal(out, out_exp)
+
+    def test_transpose_2D_large(self):
+        # Arrange
+        inp, _ = np.indices((137, 61), dtype=np.int32)
+        inp_dev = cp.asarray(inp)
+        out_dev = cp.empty((61, 137), dtype=np.int32)
+
+        # Act
+        AU = gau.TransposeKernel()
+        AU.transpose(inp_dev, out_dev)
+
+        # Assert
+        out_exp = np.transpose(inp, (1, 0))
+        out = cp.asnumpy(out_dev)
+        np.testing.assert_array_equal(out, out_exp)
+
+    def test_transpose_4D(self):
+        # Arrange
+        inp = np.random.randint(0, 10000, (250, 3, 5, 3),
+                                dtype=np.int32)  # like addr
+        inp_dev = cp.asarray(inp)
+        out_dev = cp.empty((5, 3, 250, 3), dtype=np.int32)
+
+        # Act
+        AU = gau.TransposeKernel()
+        AU.transpose(inp_dev.reshape(750, 15), out_dev.reshape(15, 750))
+
+        # Assert
+        out_exp = np.transpose(inp, (2, 3, 0, 1))
+        out = cp.asnumpy(out_dev)
+        np.testing.assert_array_equal(out, out_exp)
+
+    def test_complex_gaussian_filter_1d_no_blurring_UNITY(self):
+        # Arrange: single non-zero sample, filter width 0 (no blurring requested)
+        data = np.zeros((11,), dtype=np.complex64)
+        data[5] = 1.0 +1.0j
+        mfs = [0]
+        data_dev = cp.asarray(data)
+        tmp_dev = cp.empty((11,), dtype=np.complex64)
+
+        # Act: the result is read back from data_dev afterwards
+        GS = gau.GaussianSmoothingKernel()
+        GS.convolution(data_dev, mfs, tmp=tmp_dev)
+
+        # Assert: must agree with the accelerate.base implementation
+        out_exp = au.complex_gaussian_filter(data, mfs)
+        out = data_dev.get()
+        np.testing.assert_allclose(out_exp, out, rtol=1e-5)
+
+    def test_complex_gaussian_filter_1d_little_blurring_UNITY(self):
+        # Arrange
+        data = np.zeros((11,), dtype=np.complex64)
+        data[5] = 1.0 +1.0j
+        mfs = [0.2]
+        data_dev = cp.asarray(data)
+        tmp_dev = cp.empty((11,), dtype=np.complex64)
+
+        # Act
+        GS = gau.GaussianSmoothingKernel()
+        GS.convolution(data_dev, mfs, tmp=tmp_dev)
+
+        # Assert
+        out_exp = au.complex_gaussian_filter(data, mfs)
+        out = data_dev.get()
+        np.testing.assert_allclose(out_exp, out, rtol=1e-5)
+
+
+    def test_complex_gaussian_filter_1d_more_blurring_UNITY(self):
+        # Arrange
+        data = np.zeros((11,), dtype=np.complex64)
+        data[5] = 1.0 +1.0j
+        mfs = [2.0]
+        data_dev = cp.asarray(data)
+        tmp_dev = cp.empty((11,), dtype=np.complex64)
+
+        # Act
+        GS = gau.GaussianSmoothingKernel()
+        GS.convolution(data_dev, mfs, tmp=tmp_dev)
+
+        # Assert
+        out_exp = au.complex_gaussian_filter(data, mfs)
+        out = data_dev.get()
+        np.testing.assert_allclose(out_exp, out, rtol=1e-5)
+
+    def test_complex_gaussian_filter_2d_no_blurring_UNITY(self):
+        # Arrange
+        data = np.zeros((11, 11), dtype=np.complex64)
+        data[5, 5] = 1.0+1.0j
+        mfs = 0.0,0.0
+        data_dev = cp.asarray(data)
+        tmp_dev = cp.empty((11,11), dtype=np.complex64)
+
+        # Act
+        GS = gau.GaussianSmoothingKernel()
+        GS.convolution(data_dev, mfs, tmp=tmp_dev)
+
+        # Assert
+        out_exp = au.complex_gaussian_filter(data, mfs)
+        out = data_dev.get()
+        np.testing.assert_allclose(out_exp, out, rtol=1e-5)
+
+    def test_complex_gaussian_filter_2d_little_blurring_UNITY(self):
+        # Arrange
+        data = np.zeros((11, 11), dtype=np.complex64)
+        data[5, 5] = 1.0+1.0j
+        mfs = 0.2,0.2
+        data_dev = cp.asarray(data)
+        tmp_dev = cp.empty((11,11),dtype=np.complex64)
+
+        # Act
+        GS = gau.GaussianSmoothingKernel()
+        GS.convolution(data_dev, mfs, tmp=tmp_dev)
+
+        # Assert
+        out_exp = au.complex_gaussian_filter(data, mfs)
+        out = data_dev.get()
+        np.testing.assert_allclose(out_exp, out, rtol=1e-5)
+
+    def test_complex_gaussian_filter_2d_more_blurring_UNITY(self):
+        # Arrange
+        data = np.zeros((8, 8), dtype=np.complex64)
+        data[3:5, 3:5] = 2.0+2.0j
+        mfs = 3.0,4.0
+        data_dev = cp.asarray(data)
+        #tmp_dev = cp.empty((8,8), dtype=np.complex64)
+
+        # Act
+        GS = gau.GaussianSmoothingKernel()
+        GS.convolution(data_dev, mfs)
+
+        # Assert
+        out_exp = au.complex_gaussian_filter(data, mfs)
+        out = data_dev.get()
+        np.testing.assert_allclose(out_exp, out, rtol=1e-4)
+
+    def test_complex_gaussian_filter_2d_nonsquare_UNITY(self):
+        # Arrange
+        data = np.zeros((32, 16), dtype=np.complex64)
+        data[3:4, 11:12] = 2.0+2.0j
+        data[3:5, 3:5] = 2.0+2.0j
+        data[20:25,3:5] = 2.0+2.0j
+        mfs = 1.0,1.0
+        data_dev = cp.asarray(data)
+        tmp_dev = cp.empty(data_dev.shape, dtype=np.complex64)
+
+        # Act
+        GS = gau.GaussianSmoothingKernel()
+        GS.convolution(data_dev, mfs, tmp=tmp_dev)
+
+        # Assert
+        out_exp = au.complex_gaussian_filter(data, mfs)
+        out = data_dev.get()
+
+        np.testing.assert_allclose(out_exp, out, rtol=1e-4)
+
+    def test_complex_gaussian_filter_2d_batched(self):
+        # Arrange
+        batch_number = 2
+        A = 5
+        B = 5
+        data = np.zeros((batch_number, A, B), dtype=np.complex64)
+        data[:, 2:3, 2:3] = 2.0+2.0j
+        mfs = 3.0,4.0
+        data_dev = cp.asarray(data)
+        tmp_dev = cp.empty((batch_number,A,B), dtype=np.complex64)
+
+        # Act
+        GS = gau.GaussianSmoothingKernel()
+        GS.convolution(data_dev, mfs, tmp=tmp_dev)
+
+        # Assert
+        out_exp = au.complex_gaussian_filter(data, mfs)
+        out = data_dev.get()
+        np.testing.assert_allclose(out_exp, out, rtol=1e-4)
+
+
+
+    def test_crop_pad_simple_1_UNITY(self):
+        # pad, integer, 2D
+        B = np.indices((4, 4), dtype=np.int32).sum(0)
+        A = np.zeros((6, 6), dtype=B.dtype)
+        B_dev = cp.asarray(B)
+        A_dev = cp.asarray(A)
+
+        # Act
+        au.crop_pad_2d_simple(A, B)
+        k = gau.CropPadKernel(queue=self.stream)
+        k.crop_pad_2d_simple(A_dev, B_dev)
+
+        # Assert
+        np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6)
+
+    def test_crop_pad_simple_2_UNITY(self):
+        # crop, float, 3D
+        B = np.indices((4, 4), dtype=np.float32)
+        A = np.zeros((2, 2, 2), dtype=B.dtype)
+        B_dev = cp.asarray(B)
+        A_dev = cp.asarray(A)
+
+        # Act
+        au.crop_pad_2d_simple(A, B)
+        k = gau.CropPadKernel(queue=self.stream)
+        k.crop_pad_2d_simple(A_dev, B_dev)
+
+        # Assert
+        np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6)
+
+    def test_crop_pad_simple_3_UNITY(self):
+        # crop/pad, complex, 3D
+        B = np.indices((4, 3), dtype=np.complex64)
+        B = np.indices((4, 3), dtype=np.complex64) + 1j * B[::-1, :, :]
+        A = np.zeros((2, 2, 5), dtype=B.dtype)
+        B_dev = cp.asarray(B)
+        A_dev = cp.asarray(A)
+
+        # Act
+        au.crop_pad_2d_simple(A, B)
+        k = gau.CropPadKernel(queue=self.stream)
+        k.crop_pad_2d_simple(A_dev, B_dev)
+
+        # Assert
+        np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6)
+
+    def test_crop_pad_simple_difflike_UNITY(self):
+        np.random.seed(1983)
+        # crop/pad, 3D float32 stack: pad 256 -> 260 (A) and crop 256 -> 250 (B)
+        D = np.random.randint(0, 3000, (100, 256, 256)).astype(np.float32)
+        A = np.zeros((100, 260, 260), dtype=D.dtype)
+        B = np.zeros((100, 250, 250), dtype=D.dtype)
+        B_dev = cp.asarray(B)
+        A_dev = cp.asarray(A)
+        D_dev = cp.asarray(D)
+
+        # Act: same operation with the accelerate.base and the cupy implementation
+        au.crop_pad_2d_simple(A, D)
+        au.crop_pad_2d_simple(B, D)
+        k = gau.CropPadKernel(queue=self.stream)
+        k.crop_pad_2d_simple(A_dev, D_dev)
+        k.crop_pad_2d_simple(B_dev, D_dev)
+
+        # Assert: both implementations must agree
+        np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6)
+        np.testing.assert_allclose(B, B_dev.get(), rtol=1e-6, atol=1e-6)
+
+    def test_crop_pad_simple_oblike_UNITY(self):
+        # NOTE(review): despite its name this exercises MaxAbs2Kernel, not
+        # CropPadKernel; body matches test_max_abs2_float_UNITY - confirm intent
+        np.random.seed(1983)
+        X = np.random.randint(-1000, 1000, (3, 100, 200)).astype(np.float32)
+
+        out = np.zeros((1,), dtype=np.float32)
+        X_dev = cp.asarray(X)
+        out_dev = cp.asarray(out)
+        out = au.max_abs2(X)
+        MAK = gau.MaxAbs2Kernel(queue=self.stream)
+        MAK.max_abs2(X_dev, out_dev)
+
+        np.testing.assert_allclose(out_dev.get(), out, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object norm array has not been updated as expected")
+
+    def test_max_abs2_complex_UNITY(self):
+        np.random.seed(1983)
+        X = (np.random.randint(-1000, 1000, (3, 100, 200)).astype(np.float32) +
+             1j * np.random.randint(-1000, 1000, (3, 100, 200)).astype(np.float32)).astype(np.complex64)
+        out = np.zeros((1,), dtype=np.float32)
+        X_dev = cp.asarray(X)
+        out_dev = cp.asarray(out)
+
+        out = au.max_abs2(X)
+
+        MAK = gau.MaxAbs2Kernel(queue=self.stream)
+        MAK.max_abs2(X_dev, out_dev)
+
+        np.testing.assert_allclose(out_dev.get(), out, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object norm array has not been updated as expected")
+
+    def test_max_abs2_float_UNITY(self):
+        np.random.seed(1983)
+        X = np.random.randint(-1000, 1000, (3, 100, 200)).astype(np.float32)
+
+        out = np.zeros((1,), dtype=np.float32)
+        X_dev = cp.asarray(X)
+        out_dev = cp.asarray(out)
+
+        out = au.max_abs2(X)
+
+        MAK = gau.MaxAbs2Kernel(queue=self.stream)
+        MAK.max_abs2(X_dev, out_dev)
+
+        np.testing.assert_allclose(out_dev.get(), out, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object norm array has not been updated as expected")
+
+
+    def test_clip_magnitudes_to_range_UNITY(self):
+        np.random.seed(1987)
+        A = np.random.random((2,10,10))
+        B = A[0] + 1j* A[1]
+        B = B.astype(np.complex64)
+        B_gpu = cp.asarray(B)
+
+        au.clip_complex_magnitudes_to_range(B, 0.2,0.8)
+        CMK = gau.ClipMagnitudesKernel()
+        CMK.clip_magnitudes_to_range(B_gpu, 0.2, 0.8)
+
+        np.testing.assert_allclose(B_gpu.get(), B, rtol=1e-6, atol=1e-6,
+            err_msg="The magnitudes of the array have not been clipped as expected")
+
+    def test_mass_center_2d_UNITY(self):
+        np.random.seed(1987)
+        A = np.random.random((128, 128)).astype(np.float32)
+        A_gpu = cp.asarray(A)
+
+        out = au.mass_center(A)
+
+        MCK = gau.MassCenterKernel()
+        mc_d = MCK.mass_center(A_gpu)
+        mc = mc_d.get()
+
+        np.testing.assert_allclose(out, mc, rtol=1e-6, atol=1e-6,
+            err_msg="The centre of mass of the array has not been calculated as expected")
+
+
+    def test_mass_center_3d_UNITY(self):
+        np.random.seed(1987)
+        A = np.random.random((128, 128, 128)).astype(np.float32)
+        A_gpu = cp.asarray(A)
+
+        out = au.mass_center(A)
+
+        MCK = gau.MassCenterKernel()
+        mc_d = MCK.mass_center(A_gpu)
+        mc = mc_d.get()
+
+        np.testing.assert_allclose(out, mc, rtol=1e-6, atol=1e-6,
+            err_msg="The centre of mass of the array has not been calculated as expected")
+
+    def test_abs2sum_complex_float_UNITY(self):
+        np.random.seed(1987)
+        A = np.random.random((3, 321, 123)).astype(np.float32)
+        B = A + A**2 * 1j
+        B_gpu = cp.asarray(B)
+
+        out = au.abs2(B).sum(0)
+
+        A2SK = gau.Abs2SumKernel(dtype=B_gpu.dtype)
+        a2s_d = A2SK.abs2sum(B_gpu)
+        a2s = a2s_d.get()
+
+        np.testing.assert_allclose(out, a2s, rtol=1e-6, atol=1e-6,
+            err_msg="The sum of absolute values along the first dimension has not been calculated as expected")
+
+    def test_abs2sum_complex_double_UNITY(self):
+        np.random.seed(1987)
+        A = np.random.random((3, 321, 123)).astype(np.float64)
+        B = A + A**2 * 1j
+        B_gpu = cp.asarray(B)
+
+        out = au.abs2(B).sum(0)
+
+        A2SK = gau.Abs2SumKernel(dtype=B_gpu.dtype)
+        a2s_d = A2SK.abs2sum(B_gpu)
+        a2s = a2s_d.get()
+
+        np.testing.assert_allclose(out, a2s, rtol=1e-6, atol=1e-6,
+            err_msg="The sum of absolute values along the first dimension has not been calculated as expected")
+
+    def test_interpolate_shift_2D_UNITY(self):
+        np.random.seed(1987)
+        A = np.random.random((259, 252)).astype(np.float32)
+        A = A + A**2 * 1j
+        A_gpu = cp.asarray(A)
+
+        cen_old = np.array([100.123, 5.678]).astype(np.float32)
+        cen_new = np.array([128.5, 127.5]).astype(np.float32)
+        shift = cen_new - cen_old
+
+        out = au.interpolated_shift(A, shift, do_linear=True)
+
+        ISK = gau.InterpolatedShiftKernel()
+        isk_d = ISK.interpolate_shift(A_gpu, shift)
+        isk = isk_d.get()
+
+        np.testing.assert_allclose(out, isk, rtol=1e-6, atol=1e-6,
+            err_msg="The shifting of array has not been calculated as expected")
+
+    def test_interpolate_shift_3D_UNITY(self):
+        np.random.seed(1987)
+        A = np.random.random((3, 200, 300)).astype(np.float32)
+        A = A + A**2 * 1j
+        A_gpu = cp.asarray(A)
+
+        cen_old = np.array([0., 180.123, 5.678]).astype(np.float32)
+        cen_new = np.array([0., 128.5, 127.5]).astype(np.float32)
+        shift = cen_new - cen_old
+
+        out = au.interpolated_shift(A, shift, do_linear=True)
+
+        ISK = gau.InterpolatedShiftKernel()
+        isk_d = ISK.interpolate_shift(A_gpu, shift[1:])
+        isk = isk_d.get()
+
+        np.testing.assert_allclose(out, isk, rtol=1e-6, atol=1e-6,
+            err_msg="The shifting of array has not been calculated as expected")
+
+    def test_interpolate_shift_integer_UNITY(self):
+        np.random.seed(1987)
+        A = np.random.random((3, 200, 300)).astype(np.float32)
+        A = A + A**2 * 1j
+        A_gpu = cp.asarray(A)
+
+        cen_old = np.array([0, 180, 5]).astype(np.float32)
+        cen_new = np.array([0, 128, 127]).astype(np.float32)
+        shift = cen_new - cen_old
+
+        out = au.interpolated_shift(A, shift, do_linear=True)
+
+        ISK = gau.InterpolatedShiftKernel()
+        isk_d = ISK.interpolate_shift(A_gpu, shift[1:])
+        isk = isk_d.get()
+
+        np.testing.assert_allclose(out, isk, rtol=1e-6, atol=1e-6,
+            err_msg="The shifting of array has not been calculated as expected")
+
+    def test_interpolate_shift_no_shift_UNITY(self):
+        np.random.seed(1987)
+        A = np.random.random((3, 200, 300)).astype(np.float32)
+        A = A + A**2 * 1j
+        A_gpu = cp.asarray(A)
+
+        cen_old = np.array([0, 0, 0]).astype(np.float32)
+        cen_new = np.array([0, 0, 0]).astype(np.float32)
+        shift = cen_new - cen_old
+
+        out = au.interpolated_shift(A, shift, do_linear=True)
+
+        ISK = gau.InterpolatedShiftKernel()
+        isk_d = ISK.interpolate_shift(A_gpu, shift[1:])
+        isk = isk_d.get()
+
+        np.testing.assert_allclose(out, isk, rtol=1e-6, atol=1e-6,
+            err_msg="The shifting of array has not been calculated as expected")
+
diff --git a/test/accelerate_tests/cuda_cupy_tests/auxiliary_wave_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/auxiliary_wave_kernel_test.py
new file mode 100644
index 000000000..df27077e2
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/auxiliary_wave_kernel_test.py
@@ -0,0 +1,666 @@
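+'''
+Tests for the cupy AuxiliaryWaveKernel: hard-coded regression values
+and comparisons against the numpy (base) implementation.
+'''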
+
+import unittest
+import numpy as np
+from . import perfrun, CupyCudaTest, have_cupy
+
+if have_cupy():
+    import cupy as cp
+    from ptypy.accelerate.cuda_cupy.kernels import AuxiliaryWaveKernel
+
+COMPLEX_TYPE = np.complex64
+FLOAT_TYPE = np.float32
+INT_TYPE = np.int32
+
+class AuxiliaryWaveKernelTest(CupyCudaTest):
+
+    def prepare_arrays(self, performance=False, scan_points=None):  # returns host (numpy) addr, object, probe, exit_wave
+        if not performance:  # small problem used by the functional tests
+            B = 3  # frame size y
+            C = 3  # frame size x
+            D = 2  # number of probe modes
+            E = B  # probe size y
+            F = C  # probe size x
+
+            npts_greater_than = 2  # how many points bigger than the probe the object is.
+            G = 2  # number of object modes
+            if scan_points is None:
+                scan_pts = 2  # one dimensional scan point number
+            else:
+                scan_pts = scan_points
+        else:  # large problem used when performance=True; same meaning of B..G as above
+            B = 128
+            C = 128
+            D = 2
+            E = B
+            F = C
+            npts_greater_than = 1215
+            G = 4
+            if scan_points is None:
+                scan_pts = 14
+            else:
+                scan_pts = scan_points
+
+        H = B + npts_greater_than  # object size y
+        I = C + npts_greater_than  # object size x
+
+        total_number_scan_positions = scan_pts ** 2  # square scan grid
+        total_number_modes = G * D  # every probe mode is paired with every object mode
+        A = total_number_scan_positions * total_number_modes  # number of exit waves: one per (scan position, mode pair), e.g. 4 * 4 = 16 with the small defaults
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):  # probe mode idx is the constant (idx + 1) * (1 + 1j)
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):  # object mode idx is the constant (3 * idx + 1) * (1 + 1j)
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):  # exit wave idx is the constant (idx + 1) * (1 + 1j)
+            exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))  # flatten the grid coordinates to 1D
+        Y = Y.reshape((total_number_scan_positions))
+
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0  # running index into exit_wave, never reset
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):  # one iteration per scan position
+            mode_idx = 0  # restarts at every scan position
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],  # probe mode, no offset
+                                                             [ob_mode, ypos, xpos],  # object mode and its (y, x) offset
+                                                             [exit_idx, 0, 0],  # exit wave index
+                                                             [0, 0, 0],  # remaining two rows are left at zero
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+        if performance:  # report the problem size for performance runs
+            print('addr={}, obj={}, pr={}, ex={}'.format(addr.shape, object_array.shape, probe.shape, exit_wave.shape))
+            # assert False
+
+        return addr, object_array, probe, exit_wave
+
+    def copy_to_gpu(self, addr, object_array, probe, exit_wave):
+        host_arrays = (addr, object_array, probe, exit_wave)
+        # convert each array with cp.asarray, preserving the argument order
+        device_arrays = tuple(cp.asarray(host) for host in host_arrays)
+        return device_arrays
+
+    def test_init(self):
+        """Check the attributes and kernel names present right after construction."""
+        # NOTE: these are private attributes; arguably only the public
+        # interface that clients rely on should be checked here.
+        kernel = AuxiliaryWaveKernel(self.stream)
+
+        for name in ("_ob_shape", "_ob_id"):
+            missing_msg = "AuxiliaryWaveKernel does not have attribute: %s" % name
+            self.assertTrue(hasattr(kernel, name), msg=missing_msg)
+
+        registered = ['build_aux', 'build_exit']
+        np.testing.assert_equal(kernel.kernels, registered,
+                                err_msg='AuxiliaryWaveKernel does not have the correct functions registered.')
+
+    def test_build_aux_same_as_exit_REGRESSION(self):
+        ## Arrange: small host arrays from prepare_arrays(), copied to the GPU
+        cpudata = self.prepare_arrays()
+        addr, object_array, probe, exit_wave = self.copy_to_gpu(*cpudata)
+        auxiliary_wave = cp.zeros_like(exit_wave)  # zero-initialised array passed to build_aux
+
+        ## Act: run the cupy build_aux with alpha = 1
+        AWK = AuxiliaryWaveKernel(self.stream)
+        alpha_set = FLOAT_TYPE(1.0)
+
+        AWK.build_aux(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=alpha_set)
+        ## Assert: hard-coded values; for these inputs they equal 2 * probe * object - exit_wave
+        expected_auxiliary_wave = np.array([[[-1. + 3.j,  -1. + 3.j,  -1. + 3.j],
+                                             [-1. + 3.j,  -1. + 3.j,  -1. + 3.j],
+                                             [-1. + 3.j,  -1. + 3.j,  -1. + 3.j]],
+                                            [[-2.+14.j,  -2.+14.j,  -2.+14.j],
+                                             [-2.+14.j,  -2.+14.j,  -2.+14.j],
+                                             [-2.+14.j,  -2.+14.j,  -2.+14.j]],
+                                            [[-3. + 5.j,  -3. + 5.j,  -3. + 5.j],
+                                             [-3. + 5.j,  -3. + 5.j,  -3. + 5.j],
+                                             [-3. + 5.j,  -3. + 5.j,  -3. + 5.j]],
+                                            [[-4.+28.j,  -4.+28.j,  -4.+28.j],
+                                             [-4.+28.j,  -4.+28.j,  -4.+28.j],
+                                             [-4.+28.j,  -4.+28.j,  -4.+28.j]],
+                                            [[-5. - 1.j,  -5. - 1.j,  -5. - 1.j],
+                                             [-5. - 1.j,  -5. - 1.j,  -5. - 1.j],
+                                             [-5. - 1.j,  -5. - 1.j,  -5. - 1.j]],
+                                            [[-6.+10.j,  -6.+10.j,  -6.+10.j],
+                                             [-6.+10.j,  -6.+10.j,  -6.+10.j],
+                                             [-6.+10.j,  -6.+10.j,  -6.+10.j]],
+                                            [[-7. + 1.j,  -7. + 1.j,  -7. + 1.j],
+                                             [-7. + 1.j,  -7. + 1.j,  -7. + 1.j],
+                                             [-7. + 1.j,  -7. + 1.j,  -7. + 1.j]],
+                                            [[-8.+24.j,  -8.+24.j,  -8.+24.j],
+                                             [-8.+24.j,  -8.+24.j,  -8.+24.j],
+                                             [-8.+24.j,  -8.+24.j,  -8.+24.j]],
+                                            [[-9. - 5.j,  -9. - 5.j,  -9. - 5.j],
+                                             [-9. - 5.j,  -9. - 5.j,  -9. - 5.j],
+                                             [-9. - 5.j,  -9. - 5.j,  -9. - 5.j]],
+                                            [[-10. + 6.j, -10. + 6.j, -10. + 6.j],
+                                             [-10. + 6.j, -10. + 6.j, -10. + 6.j],
+                                             [-10. + 6.j, -10. + 6.j, -10. + 6.j]],
+                                            [[-11. - 3.j, -11. - 3.j, -11. - 3.j],
+                                             [-11. - 3.j, -11. - 3.j, -11. - 3.j],
+                                             [-11. - 3.j, -11. - 3.j, -11. - 3.j]],
+                                            [[-12.+20.j, -12.+20.j, -12.+20.j],
+                                             [-12.+20.j, -12.+20.j, -12.+20.j],
+                                             [-12.+20.j, -12.+20.j, -12.+20.j]],
+                                            [[-13. - 9.j, -13. - 9.j, -13. - 9.j],
+                                             [-13. - 9.j, -13. - 9.j, -13. - 9.j],
+                                             [-13. - 9.j, -13. - 9.j, -13. - 9.j]],
+                                            [[-14. + 2.j, -14. + 2.j, -14. + 2.j],
+                                             [-14. + 2.j, -14. + 2.j, -14. + 2.j],
+                                             [-14. + 2.j, -14. + 2.j, -14. + 2.j]],
+                                            [[-15. - 7.j, -15. - 7.j, -15. - 7.j],
+                                             [-15. - 7.j, -15. - 7.j, -15. - 7.j],
+                                             [-15. - 7.j, -15. - 7.j, -15. - 7.j]],
+                                            [[-16.+16.j, -16.+16.j, -16.+16.j],
+                                             [-16.+16.j, -16.+16.j, -16.+16.j],
+                                             [-16.+16.j, -16.+16.j, -16.+16.j]]], dtype=COMPLEX_TYPE)
+
+        np.testing.assert_array_equal(expected_auxiliary_wave, auxiliary_wave.get(),
+                                      err_msg="The auxiliary_wave has not been updated as expected")
+
+
+    def test_build_aux_same_as_exit_UNITY(self):
+        ## Arrange: identical inputs on the host (numpy) and on the GPU (cupy)
+        addr, object_array, probe, exit_wave = self.prepare_arrays()
+        addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave)
+        auxiliary_wave = np.zeros_like(exit_wave)  # array passed to the numpy build_aux
+        auxiliary_wave_dev = cp.zeros_like(exit_wave_dev)  # array passed to the cupy build_aux
+        
+        ## Act: run build_aux on both implementations with the same alpha
+        from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel
+        nAWK = npAuxiliaryWaveKernel()
+        AWK = AuxiliaryWaveKernel(self.stream)
+        alpha_set = FLOAT_TYPE(.75)
+
+        AWK.build_aux(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, exit_wave_dev, alpha=alpha_set)
+        nAWK.build_aux(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=alpha_set)
+        
+        ## Assert: both results must agree exactly
+        np.testing.assert_array_equal(auxiliary_wave, auxiliary_wave_dev.get(),
+                                      err_msg="The gpu auxiliary_wave does not look the same as the numpy version")
+
+    def test_build_aux2_same_as_exit_UNITY(self):
+        ## Arrange: identical inputs on the host (numpy) and on the GPU (cupy)
+        addr, object_array, probe, exit_wave = self.prepare_arrays()
+        addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave)
+        auxiliary_wave = np.zeros_like(exit_wave)  # array passed to the numpy build_aux
+        auxiliary_wave_dev = cp.zeros_like(exit_wave_dev)  # array passed to the cupy build_aux2
+        
+        ## Act: cupy build_aux2 is checked against the numpy build_aux (same alpha)
+        from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel
+        nAWK = npAuxiliaryWaveKernel()
+        AWK = AuxiliaryWaveKernel(self.stream)
+        alpha_set = FLOAT_TYPE(.75)
+
+        AWK.build_aux2(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, exit_wave_dev, alpha=alpha_set)
+        nAWK.build_aux(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=alpha_set)
+        
+        ## Assert: both results must agree exactly
+        np.testing.assert_array_equal(auxiliary_wave, auxiliary_wave_dev.get(),
+                                      err_msg="The gpu auxiliary_wave does not look the same as the numpy version")
+
+    def test_build_exit_aux_same_as_exit_REGRESSION(self):
+        ## Arrange: small host arrays from prepare_arrays(), copied to the GPU
+        addr, object_array, probe, exit_wave = self.prepare_arrays()
+        addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave)
+        auxiliary_wave_dev = cp.zeros_like(exit_wave_dev)
+        
+        ## Act: build_exit writes to both auxiliary_wave_dev and exit_wave_dev
+        AWK = AuxiliaryWaveKernel(self.stream)
+        # build_exit is called without an alpha argument, so none is prepared here
+        AWK.build_exit(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, exit_wave_dev)
+
+        ## Assert: hard-coded values; for these inputs aux equals -(probe * object)
+        expected_auxiliary_wave = np.array([[[0. - 2.j, 0. - 2.j, 0. - 2.j],
+                                             [0. - 2.j, 0. - 2.j, 0. - 2.j],
+                                             [0. - 2.j, 0. - 2.j, 0. - 2.j]],
+                                            [[0. - 8.j, 0. - 8.j, 0. - 8.j],
+                                             [0. - 8.j, 0. - 8.j, 0. - 8.j],
+                                             [0. - 8.j, 0. - 8.j, 0. - 8.j]],
+                                            [[0. - 4.j, 0. - 4.j, 0. - 4.j],
+                                             [0. - 4.j, 0. - 4.j, 0. - 4.j],
+                                             [0. - 4.j, 0. - 4.j, 0. - 4.j]],
+                                            [[0.-16.j, 0.-16.j, 0.-16.j],
+                                             [0.-16.j, 0.-16.j, 0.-16.j],
+                                             [0.-16.j, 0.-16.j, 0.-16.j]],
+                                            [[0. - 2.j, 0. - 2.j, 0. - 2.j],
+                                             [0. - 2.j, 0. - 2.j, 0. - 2.j],
+                                             [0. - 2.j, 0. - 2.j, 0. - 2.j]],
+                                            [[0. - 8.j, 0. - 8.j, 0. - 8.j],
+                                             [0. - 8.j, 0. - 8.j, 0. - 8.j],
+                                             [0. - 8.j, 0. - 8.j, 0. - 8.j]],
+                                            [[0. - 4.j, 0. - 4.j, 0. - 4.j],
+                                             [0. - 4.j, 0. - 4.j, 0. - 4.j],
+                                             [0. - 4.j, 0. - 4.j, 0. - 4.j]],
+                                            [[0.-16.j, 0.-16.j, 0.-16.j],
+                                             [0.-16.j, 0.-16.j, 0.-16.j],
+                                             [0.-16.j, 0.-16.j, 0.-16.j]],
+                                            [[0. - 2.j, 0. - 2.j, 0. - 2.j],
+                                             [0. - 2.j, 0. - 2.j, 0. - 2.j],
+                                             [0. - 2.j, 0. - 2.j, 0. - 2.j]],
+                                            [[0. - 8.j, 0. - 8.j, 0. - 8.j],
+                                             [0. - 8.j, 0. - 8.j, 0. - 8.j],
+                                             [0. - 8.j, 0. - 8.j, 0. - 8.j]],
+                                            [[0. - 4.j, 0. - 4.j, 0. - 4.j],
+                                             [0. - 4.j, 0. - 4.j, 0. - 4.j],
+                                             [0. - 4.j, 0. - 4.j, 0. - 4.j]],
+                                            [[0.-16.j, 0.-16.j, 0.-16.j],
+                                             [0.-16.j, 0.-16.j, 0.-16.j],
+                                             [0.-16.j, 0.-16.j, 0.-16.j]],
+                                            [[0. - 2.j, 0. - 2.j, 0. - 2.j],
+                                             [0. - 2.j, 0. - 2.j, 0. - 2.j],
+                                             [0. - 2.j, 0. - 2.j, 0. - 2.j]],
+                                            [[0. - 8.j, 0. - 8.j, 0. - 8.j],
+                                             [0. - 8.j, 0. - 8.j, 0. - 8.j],
+                                             [0. - 8.j, 0. - 8.j, 0. - 8.j]],
+                                            [[0. - 4.j, 0. - 4.j, 0. - 4.j],
+                                             [0. - 4.j, 0. - 4.j, 0. - 4.j],
+                                             [0. - 4.j, 0. - 4.j, 0. - 4.j]],
+                                            [[0.-16.j, 0.-16.j, 0.-16.j],
+                                             [0.-16.j, 0.-16.j, 0.-16.j],
+                                             [0.-16.j, 0.-16.j, 0.-16.j]]], dtype=COMPLEX_TYPE)
+
+        np.testing.assert_array_equal(expected_auxiliary_wave, auxiliary_wave_dev.get(),
+                                      err_msg="The auxiliary_wave has not been updated as expected")
+        # for these inputs the expected exit wave is the input exit wave plus the aux values above
+        expected_exit_wave = np.array([[[1. - 1.j,  1. - 1.j,  1. - 1.j],
+                                        [1. - 1.j,  1. - 1.j,  1. - 1.j],
+                                        [1. - 1.j,  1. - 1.j,  1. - 1.j]],
+                                       [[2. - 6.j,  2. - 6.j,  2. - 6.j],
+                                        [2. - 6.j,  2. - 6.j,  2. - 6.j],
+                                        [2. - 6.j,  2. - 6.j,  2. - 6.j]],
+                                       [[3. - 1.j,  3. - 1.j,  3. - 1.j],
+                                        [3. - 1.j,  3. - 1.j,  3. - 1.j],
+                                        [3. - 1.j,  3. - 1.j,  3. - 1.j]],
+                                       [[4. - 12.j,  4. - 12.j,  4. - 12.j],
+                                        [4. - 12.j,  4. - 12.j,  4. - 12.j],
+                                        [4. - 12.j,  4. - 12.j,  4. - 12.j]],
+                                       [[5. + 3.j,  5. + 3.j,  5. + 3.j],
+                                        [5. + 3.j,  5. + 3.j,  5. + 3.j],
+                                        [5. + 3.j,  5. + 3.j,  5. + 3.j]],
+                                       [[6. - 2.j,  6. - 2.j,  6. - 2.j],
+                                        [6. - 2.j,  6. - 2.j,  6. - 2.j],
+                                        [6. - 2.j,  6. - 2.j,  6. - 2.j]],
+                                       [[7. + 3.j,  7. + 3.j,  7. + 3.j],
+                                        [7. + 3.j,  7. + 3.j,  7. + 3.j],
+                                        [7. + 3.j,  7. + 3.j,  7. + 3.j]],
+                                       [[8. - 8.j,  8. - 8.j,  8. - 8.j],
+                                        [8. - 8.j,  8. - 8.j,  8. - 8.j],
+                                        [8. - 8.j,  8. - 8.j,  8. - 8.j]],
+                                       [[9. + 7.j,  9. + 7.j,  9. + 7.j],
+                                        [9. + 7.j,  9. + 7.j,  9. + 7.j],
+                                        [9. + 7.j,  9. + 7.j,  9. + 7.j]],
+                                       [[10. + 2.j, 10. + 2.j, 10. + 2.j],
+                                        [10. + 2.j, 10. + 2.j, 10. + 2.j],
+                                        [10. + 2.j, 10. + 2.j, 10. + 2.j]],
+                                       [[11. + 7.j, 11. + 7.j, 11. + 7.j],
+                                        [11. + 7.j, 11. + 7.j, 11. + 7.j],
+                                        [11. + 7.j, 11. + 7.j, 11. + 7.j]],
+                                       [[12. - 4.j, 12. - 4.j, 12. - 4.j],
+                                        [12. - 4.j, 12. - 4.j, 12. - 4.j],
+                                        [12. - 4.j, 12. - 4.j, 12. - 4.j]],
+                                       [[13. + 11.j, 13. + 11.j, 13. + 11.j],
+                                        [13. + 11.j, 13. + 11.j, 13. + 11.j],
+                                        [13. + 11.j, 13. + 11.j, 13. + 11.j]],
+                                       [[14. + 6.j, 14. + 6.j, 14. + 6.j],
+                                        [14. + 6.j, 14. + 6.j, 14. + 6.j],
+                                        [14. + 6.j, 14. + 6.j, 14. + 6.j]],
+                                       [[15. + 11.j, 15. + 11.j, 15. + 11.j],
+                                        [15. + 11.j, 15. + 11.j, 15. + 11.j],
+                                        [15. + 11.j, 15. + 11.j, 15. + 11.j]],
+                                       [[16. + 0.j, 16. + 0.j, 16. + 0.j],
+                                        [16. + 0.j, 16. + 0.j, 16. + 0.j],
+                                        [16. + 0.j, 16. + 0.j, 16. + 0.j]]], dtype=COMPLEX_TYPE)
+
+        np.testing.assert_array_equal(expected_exit_wave, exit_wave_dev.get(),
+                                      err_msg="The exit_wave has not been updated as expected")
+
+    def test_build_exit_aux_same_as_exit_UNITY(self):
+        ## Arrange: identical inputs on the host (numpy) and on the GPU (cupy)
+        addr, object_array, probe, exit_wave = self.prepare_arrays()
+        addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave)
+        auxiliary_wave = np.zeros_like(exit_wave)  # array passed to the numpy build_exit
+        auxiliary_wave_dev = cp.zeros_like(exit_wave_dev)  # array passed to the cupy build_exit
+
+        ## Act: run build_exit on both implementations
+        from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel
+        nAWK = npAuxiliaryWaveKernel()
+        AWK = AuxiliaryWaveKernel(self.stream)
+
+        AWK.build_exit(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, exit_wave_dev)
+        nAWK.build_exit(auxiliary_wave, addr, object_array, probe, exit_wave)
+
+        ## Assert: both the auxiliary wave and the exit wave must agree exactly
+        np.testing.assert_array_equal(auxiliary_wave, auxiliary_wave_dev.get(),
+                                      err_msg="The gpu auxiliary_wave does not look the same as the numpy version")
+
+        np.testing.assert_array_equal(exit_wave, exit_wave_dev.get(),
+                                      err_msg="The gpu exit_wave does not look the same as the numpy version")
+
+    def test_build_aux_no_ex_noadd_REGRESSION(self):
+        ## Arrange: host arrays are rebound to their GPU copies; exit_wave only provides the aux shape
+        addr, object_array, probe, exit_wave = self.prepare_arrays()
+        addr, object_array, probe, exit_wave = self.copy_to_gpu(addr, object_array, probe, exit_wave)
+        auxiliary_wave = cp.zeros_like(exit_wave)
+
+        ## Act: build_aux_no_ex takes no exit wave; fac=1.0 and add=False
+        AWK = AuxiliaryWaveKernel(self.stream)
+        AWK.allocate()
+        AWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, 
+            fac=1.0, add=False)
+
+        ## Assert: hard-coded values; for these inputs they equal probe * object
+        expected_auxiliary_wave = np.array([[[0. + 2.j, 0. + 2.j, 0. + 2.j],
+                                             [0. + 2.j, 0. + 2.j, 0. + 2.j],
+                                             [0. + 2.j, 0. + 2.j, 0. + 2.j]],
+                                            [[0. + 8.j, 0. + 8.j, 0. + 8.j],
+                                             [0. + 8.j, 0. + 8.j, 0. + 8.j],
+                                             [0. + 8.j, 0. + 8.j, 0. + 8.j]],
+                                            [[0. + 4.j, 0. + 4.j, 0. + 4.j],
+                                             [0. + 4.j, 0. + 4.j, 0. + 4.j],
+                                             [0. + 4.j, 0. + 4.j, 0. + 4.j]],
+                                            [[0. + 16.j, 0. + 16.j, 0. + 16.j],
+                                             [0. + 16.j, 0. + 16.j, 0. + 16.j],
+                                             [0. + 16.j, 0. + 16.j, 0. + 16.j]],
+                                            [[0. + 2.j, 0. + 2.j, 0. + 2.j],
+                                             [0. + 2.j, 0. + 2.j, 0. + 2.j],
+                                             [0. + 2.j, 0. + 2.j, 0. + 2.j]],
+                                            [[0. + 8.j, 0. + 8.j, 0. + 8.j],
+                                             [0. + 8.j, 0. + 8.j, 0. + 8.j],
+                                             [0. + 8.j, 0. + 8.j, 0. + 8.j]],
+                                            [[0. + 4.j, 0. + 4.j, 0. + 4.j],
+                                             [0. + 4.j, 0. + 4.j, 0. + 4.j],
+                                             [0. + 4.j, 0. + 4.j, 0. + 4.j]],
+                                            [[0. + 16.j, 0. + 16.j, 0. + 16.j],
+                                             [0. + 16.j, 0. + 16.j, 0. + 16.j],
+                                             [0. + 16.j, 0. + 16.j, 0. + 16.j]],
+                                            [[0. + 2.j, 0. + 2.j, 0. + 2.j],
+                                             [0. + 2.j, 0. + 2.j, 0. + 2.j],
+                                             [0. + 2.j, 0. + 2.j, 0. + 2.j]],
+                                            [[0. + 8.j, 0. + 8.j, 0. + 8.j],
+                                             [0. + 8.j, 0. + 8.j, 0. + 8.j],
+                                             [0. + 8.j, 0. + 8.j, 0. + 8.j]],
+                                            [[0. + 4.j, 0. + 4.j, 0. + 4.j],
+                                             [0. + 4.j, 0. + 4.j, 0. + 4.j],
+                                             [0. + 4.j, 0. + 4.j, 0. + 4.j]],
+                                            [[0. + 16.j, 0. + 16.j, 0. + 16.j],
+                                             [0. + 16.j, 0. + 16.j, 0. + 16.j],
+                                             [0. + 16.j, 0. + 16.j, 0. + 16.j]],
+                                            [[0. + 2.j, 0. + 2.j, 0. + 2.j],
+                                             [0. + 2.j, 0. + 2.j, 0. + 2.j],
+                                             [0. + 2.j, 0. + 2.j, 0. + 2.j]],
+                                            [[0. + 8.j, 0. + 8.j, 0. + 8.j],
+                                             [0. + 8.j, 0. + 8.j, 0. + 8.j],
+                                             [0. + 8.j, 0. + 8.j, 0. + 8.j]],
+                                            [[0. + 4.j, 0. + 4.j, 0. + 4.j],
+                                             [0. + 4.j, 0. + 4.j, 0. + 4.j],
+                                             [0. + 4.j, 0. + 4.j, 0. + 4.j]],
+                                            [[0. + 16.j, 0. + 16.j, 0. + 16.j],
+                                             [0. + 16.j, 0. + 16.j, 0. + 16.j],
+                                             [0. + 16.j, 0. + 16.j, 0. + 16.j]]], dtype=np.complex64)
+        np.testing.assert_array_equal(auxiliary_wave.get(), expected_auxiliary_wave,
+                                      err_msg="The auxiliary_wave has not been updated as expected")
+
+    def test_build_aux_no_ex_noadd_UNITY(self):
+        ## Arrange: identical inputs on the host (numpy) and on the GPU (cupy)
+        addr, object_array, probe, exit_wave = self.prepare_arrays()
+        addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave)
+        auxiliary_wave_dev = cp.zeros_like(exit_wave_dev)  # array passed to the cupy build_aux_no_ex
+        auxiliary_wave = np.zeros_like(exit_wave)  # array passed to the numpy build_aux_no_ex
+
+        ## Act: run build_aux_no_ex (fac=1.0, add=False) on both implementations
+        AWK = AuxiliaryWaveKernel(self.stream)
+        AWK.allocate()
+        AWK.build_aux_no_ex(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, 
+            fac=1.0, add=False)
+        from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel
+        nAWK = npAuxiliaryWaveKernel()
+        nAWK.allocate()
+        nAWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=1.0, add=False)
+
+        ## Assert: both results must agree exactly
+        np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave,
+                                      err_msg="The auxiliary_wave does not match numpy")
+
+    def test_build_aux2_no_ex_noadd_UNITY(self):
+        ## Arrange: identical inputs on the host (numpy) and on the GPU (cupy)
+        addr, object_array, probe, exit_wave = self.prepare_arrays()
+        addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave)
+        auxiliary_wave_dev = cp.zeros_like(exit_wave_dev)  # array passed to the cupy build_aux2_no_ex
+        auxiliary_wave = np.zeros_like(exit_wave)  # array passed to the numpy build_aux_no_ex
+
+        ## Act: cupy build_aux2_no_ex is checked against the numpy build_aux_no_ex (fac=1.0, add=False)
+        AWK = AuxiliaryWaveKernel(self.stream)
+        AWK.allocate()
+        AWK.build_aux2_no_ex(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, 
+            fac=1.0, add=False)
+        from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel
+        nAWK = npAuxiliaryWaveKernel()
+        nAWK.allocate()
+        nAWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=1.0, add=False)
+
+        ## Assert: both results must agree exactly
+        np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave,
+                                      err_msg="The auxiliary_wave does not match numpy")
+
+
+    def test_build_aux_no_ex_add_REGRESSION(self):
+        ## Arrange: host arrays are rebound to their GPU copies; aux starts at one
+        addr, object_array, probe, exit_wave = self.prepare_arrays()
+        addr, object_array, probe, exit_wave = self.copy_to_gpu(addr, object_array, probe, exit_wave)
+        auxiliary_wave = cp.ones_like(exit_wave)
+
+        ## Act: add=True with a scaling factor of 2
+        AWK = AuxiliaryWaveKernel(self.stream)
+        AWK.allocate()
+        fac = 2.0
+        AWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=fac, add=True)
+
+        ## Assert: same base values as the add=False regression test; scaled by fac and offset by 1 below
+        expected_auxiliary_wave = np.array([[[0. + 2.j, 0. + 2.j, 0. + 2.j],
+                                             [0. + 2.j, 0. + 2.j, 0. + 2.j],
+                                             [0. + 2.j, 0. + 2.j, 0. + 2.j]],
+                                            [[0. + 8.j, 0. + 8.j, 0. + 8.j],
+                                             [0. + 8.j, 0. + 8.j, 0. + 8.j],
+                                             [0. + 8.j, 0. + 8.j, 0. + 8.j]],
+                                            [[0. + 4.j, 0. + 4.j, 0. + 4.j],
+                                             [0. + 4.j, 0. + 4.j, 0. + 4.j],
+                                             [0. + 4.j, 0. + 4.j, 0. + 4.j]],
+                                            [[0. + 16.j, 0. + 16.j, 0. + 16.j],
+                                             [0. + 16.j, 0. + 16.j, 0. + 16.j],
+                                             [0. + 16.j, 0. + 16.j, 0. + 16.j]],
+                                            [[0. + 2.j, 0. + 2.j, 0. + 2.j],
+                                             [0. + 2.j, 0. + 2.j, 0. + 2.j],
+                                             [0. + 2.j, 0. + 2.j, 0. + 2.j]],
+                                            [[0. + 8.j, 0. + 8.j, 0. + 8.j],
+                                             [0. + 8.j, 0. + 8.j, 0. + 8.j],
+                                             [0. + 8.j, 0. + 8.j, 0. + 8.j]],
+                                            [[0. + 4.j, 0. + 4.j, 0. + 4.j],
+                                             [0. + 4.j, 0. + 4.j, 0. + 4.j],
+                                             [0. + 4.j, 0. + 4.j, 0. + 4.j]],
+                                            [[0. + 16.j, 0. + 16.j, 0. + 16.j],
+                                             [0. + 16.j, 0. + 16.j, 0. + 16.j],
+                                             [0. + 16.j, 0. + 16.j, 0. + 16.j]],
+                                            [[0. + 2.j, 0. + 2.j, 0. + 2.j],
+                                             [0. + 2.j, 0. + 2.j, 0. + 2.j],
+                                             [0. + 2.j, 0. + 2.j, 0. + 2.j]],
+                                            [[0. + 8.j, 0. + 8.j, 0. + 8.j],
+                                             [0. + 8.j, 0. + 8.j, 0. + 8.j],
+                                             [0. + 8.j, 0. + 8.j, 0. + 8.j]],
+                                            [[0. + 4.j, 0. + 4.j, 0. + 4.j],
+                                             [0. + 4.j, 0. + 4.j, 0. + 4.j],
+                                             [0. + 4.j, 0. + 4.j, 0. + 4.j]],
+                                            [[0. + 16.j, 0. + 16.j, 0. + 16.j],
+                                             [0. + 16.j, 0. + 16.j, 0. + 16.j],
+                                             [0. + 16.j, 0. + 16.j, 0. + 16.j]],
+                                            [[0. + 2.j, 0. + 2.j, 0. + 2.j],
+                                             [0. + 2.j, 0. + 2.j, 0. + 2.j],
+                                             [0. + 2.j, 0. + 2.j, 0. + 2.j]],
+                                            [[0. + 8.j, 0. + 8.j, 0. + 8.j],
+                                             [0. + 8.j, 0. + 8.j, 0. + 8.j],
+                                             [0. + 8.j, 0. + 8.j, 0. + 8.j]],
+                                            [[0. + 4.j, 0. + 4.j, 0. + 4.j],
+                                             [0. + 4.j, 0. + 4.j, 0. + 4.j],
+                                             [0. + 4.j, 0. + 4.j, 0. + 4.j]],
+                                            [[0. + 16.j, 0. + 16.j, 0. + 16.j],
+                                             [0. + 16.j, 0. + 16.j, 0. + 16.j],
+                                             [0. + 16.j, 0. + 16.j, 0. + 16.j]]], dtype=np.complex64)
+        expected_auxiliary_wave = fac*expected_auxiliary_wave + 1  # the +1 accounts for the ones the aux array started with
+        np.testing.assert_array_equal(auxiliary_wave.get(), expected_auxiliary_wave,
+                                      err_msg="The auxiliary_wave has not been updated as expected")
+
+    def test_build_aux_no_ex_add_UNITY(self):
+        ## Arrange: identical inputs on the host (numpy) and on the GPU (cupy); aux arrays start at one
+        addr, object_array, probe, exit_wave = self.prepare_arrays()
+        addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave)
+        auxiliary_wave_dev = cp.ones_like(exit_wave_dev)  # array passed to the cupy build_aux_no_ex
+        auxiliary_wave = np.ones_like(exit_wave)  # array passed to the numpy build_aux_no_ex
+
+        ## Act: run build_aux_no_ex (fac=2.0, add=True) on both implementations
+        AWK = AuxiliaryWaveKernel(self.stream)
+        AWK.allocate()
+        AWK.build_aux_no_ex(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, 
+            fac=2.0, add=True)
+        from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel
+        nAWK = npAuxiliaryWaveKernel()
+        nAWK.allocate()
+        nAWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=2.0, add=True)
+
+        ## Assert: both results must agree exactly
+        np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave,
+                                      err_msg="The auxiliary_wave does not match numpy")
+
+    def test_build_aux2_no_ex_add_UNITY(self):
+        ## Arrange: host arrays, their device copies, and aux buffers pre-filled with ones on both sides
+        addr, object_array, probe, exit_wave = self.prepare_arrays()
+        addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave)
+        auxiliary_wave_dev = cp.ones_like(exit_wave_dev)
+        auxiliary_wave = np.ones_like(exit_wave)
+
+        ## Act: device side uses build_aux2_no_ex; the numpy reference below is build_aux_no_ex
+        AWK = AuxiliaryWaveKernel(self.stream)
+        AWK.allocate()
+        AWK.build_aux2_no_ex(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, 
+            fac=2.0, add=True)
+        from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel
+        nAWK = npAuxiliaryWaveKernel()
+        nAWK.allocate()
+        nAWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=2.0, add=True)
+
+        ## Assert: exact match; NOTE(review): presumably aux2 is an alternative device implementation of the same operation - confirm
+        np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave,
+                                      err_msg="The auxiliary_wave does not match numpy")
+
+
+    @unittest.skipIf(not perfrun, "performance test")
+    def test_build_aux_no_ex_performance(self):
+        addr, object_array, probe, exit_wave = self.prepare_arrays(performance=True)
+        addr, object_array, probe, exit_wave = self.copy_to_gpu(addr, object_array, probe, exit_wave)
+        auxiliary_wave = cp.zeros_like(exit_wave)
+        # No assertion follows: the test only launches the kernel on the performance=True inputs.
+        AWK = AuxiliaryWaveKernel(self.stream)
+        AWK.allocate()
+        AWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, 
+            fac=1.0, add=False)
+
+
+    def test_build_exit_alpha_tau_REGRESSION(self):
+        ## Arrange: arrays for scan_points=1; the host names are rebound to their device copies
+        addr, object_array, probe, exit_wave = self.prepare_arrays(scan_points=1)
+        addr, object_array, probe, exit_wave = self.copy_to_gpu(addr, object_array, probe, exit_wave)
+        auxiliary_wave = cp.zeros_like(exit_wave)
+
+        ## Act: alpha and tau are not passed, so the kernel's default values apply
+        AWK = AuxiliaryWaveKernel(self.stream)
+        AWK.allocate()
+        AWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave)
+
+        # Assert: the auxiliary wave and the exit wave are read back and compared with hard-coded values
+        expected_auxiliary_wave = np.array(
+                [[[0. -2.j, 0. -2.j, 0. -2.j],
+                [0. -2.j, 0. -2.j, 0. -2.j],
+                [0. -2.j, 0. -2.j, 0. -2.j]],
+
+                [[0. -8.j, 0. -8.j, 0. -8.j],
+                [0. -8.j, 0. -8.j, 0. -8.j],
+                [0. -8.j, 0. -8.j, 0. -8.j]],
+
+                [[0. -4.j, 0. -4.j, 0. -4.j],
+                [0. -4.j, 0. -4.j, 0. -4.j],
+                [0. -4.j, 0. -4.j, 0. -4.j]],
+
+                [[0.-16.j, 0.-16.j, 0.-16.j],
+                [0.-16.j, 0.-16.j, 0.-16.j],
+                [0.-16.j, 0.-16.j, 0.-16.j]]], dtype=np.complex64)
+        np.testing.assert_allclose(auxiliary_wave.get(), expected_auxiliary_wave, rtol=1e-6, atol=1e-6,
+                                      err_msg="The auxiliary_wave has not been updated as expected")
+
+        expected_exit_wave = np.array(
+                [[[1. -1.j, 1. -1.j, 1. -1.j],
+                [1. -1.j, 1. -1.j, 1. -1.j],
+                [1. -1.j, 1. -1.j, 1. -1.j]],
+
+                [[2. -6.j, 2. -6.j, 2. -6.j],
+                [2. -6.j, 2. -6.j, 2. -6.j],
+                [2. -6.j, 2. -6.j, 2. -6.j]],
+
+                [[3. -1.j, 3. -1.j, 3. -1.j],
+                [3. -1.j, 3. -1.j, 3. -1.j],
+                [3. -1.j, 3. -1.j, 3. -1.j]],
+
+                [[4.-12.j, 4.-12.j, 4.-12.j],
+                [4.-12.j, 4.-12.j, 4.-12.j],
+                [4.-12.j, 4.-12.j, 4.-12.j]]], dtype=np.complex64)
+        np.testing.assert_allclose(exit_wave.get(), expected_exit_wave, rtol=1e-6, atol=1e-6,
+                                      err_msg="The exit_wave has not been updated as expected")
+                              
+    def test_build_exit_alpha_tau_UNITY(self):
+        ## Arrange: host arrays for scan_points=1, their device copies, and aux buffers of ones on both sides
+        addr, object_array, probe, exit_wave = self.prepare_arrays(scan_points=1)
+        addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave)
+        auxiliary_wave_dev = cp.ones_like(exit_wave_dev)
+        auxiliary_wave = np.ones_like(exit_wave)
+        
+        ## Act: run build_exit_alpha_tau with alpha=0.8, tau=0.6 on the device kernel and on the numpy reference kernel
+        AWK = AuxiliaryWaveKernel(self.stream)
+        AWK.allocate()
+        AWK.build_exit_alpha_tau(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, exit_wave_dev, alpha=0.8, tau=0.6)
+        from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel
+        nAWK = npAuxiliaryWaveKernel()
+        nAWK.allocate()
+        nAWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=0.8, tau=0.6)
+
+        ## Assert: the auxiliary waves agree within rtol/atol of 1e-6
+        np.testing.assert_allclose(auxiliary_wave_dev.get(), auxiliary_wave, rtol=1e-6, atol=1e-6,
+                                      err_msg="The auxiliary_wave does not match numpy")
+        ## Assert: the exit waves agree within the same tolerances
+        np.testing.assert_allclose(exit_wave_dev.get(), exit_wave, rtol=1e-6, atol=1e-6,
+                                      err_msg="The exit_wave does not match numpy")
+
+    @unittest.skipIf(not perfrun, "performance test")
+    def test_build_exit_alpha_tau_performance(self):
+        addr, object_array, probe, exit_wave = self.prepare_arrays(performance=True, scan_points=1)
+        addr, object_array, probe, exit_wave = self.copy_to_gpu(addr, object_array, probe, exit_wave)
+        auxiliary_wave = cp.zeros_like(exit_wave)
+        # No assertion follows: the test only launches the kernel on the performance=True inputs.
+        AWK = AuxiliaryWaveKernel(self.stream)
+        AWK.allocate()
+        AWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=0.8, tau=0.6)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/accelerate_tests/cuda_cupy_tests/derivatives_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/derivatives_kernel_test.py
new file mode 100644
index 000000000..d2235539b
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/derivatives_kernel_test.py
@@ -0,0 +1,330 @@
+import unittest
+import numpy as np
+from . import perfrun, CupyCudaTest, have_cupy
+from ptypy.accelerate.base import array_utils as au
+from ptypy.utils.math_utils import delxf, delxb 
+
+COMPLEX_TYPE = np.complex64
+FLOAT_TYPE = np.float32
+INT_TYPE = np.int32
+
+if have_cupy():
+    import cupy as cp
+    from ptypy.accelerate.cuda_cupy.array_utils import DerivativesKernel
+
+class DerivativesKernelTest(CupyCudaTest):
+
+    def test_delxf_1dim(self):
+        inp = np.array([0, 1, 2, 4, 8, 0, 6], dtype=np.float32)
+        inp_dev = cp.asarray(inp)
+        outp = np.zeros_like(inp)
+        outp_dev = cp.asarray(outp)
+        # delxf is called without an axis argument and writes into a separate output array.
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxf(inp_dev, out=outp_dev)
+
+        outp[:] = outp_dev.get()
+        # Expected: exp[i] = inp[i+1] - inp[i], with 0 for the last element.
+        exp = np.array([1, 1, 2, 4, -8, 6, 0], dtype=np.float32)
+        np.testing.assert_array_equal(outp, exp)
+
+    def test_delxf_1dim_inplace(self):
+        inp = np.array([0, 1, 2, 4, 8, 0, 6], dtype=np.float32)
+        inp_dev = cp.asarray(inp)
+        # The same device array is passed as input and as out, i.e. the result overwrites the input.
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxf(inp_dev, out=inp_dev)
+
+        outp = inp_dev.get()
+        # Expected: exp[i] = inp[i+1] - inp[i], with 0 for the last element.
+        exp = np.array([1, 1, 2, 4, -8, 6, 0], dtype=np.float32)
+        np.testing.assert_array_equal(outp, exp)
+
+    def test_delxf_2dim1(self):
+        inp  = np.array([
+            [0, 2, 6],
+            [1, -4, 5]
+        ], dtype=np.float32)
+
+        inp_dev = cp.asarray(inp)
+        outp = np.zeros_like(inp)
+        outp_dev = cp.asarray(outp)
+
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxf(inp_dev, out=outp_dev, axis=0)
+
+        outp[:] = outp_dev.get()
+
+        # Expected for axis=0: row 0 holds inp[1] - inp[0]; the last row is zero.
+        exp = np.array([
+            [1, -6, -1],
+            [0, 0, 0]
+        ], dtype=np.float32)
+        np.testing.assert_array_equal(outp, exp)
+
+    def test_delxf_2dim2(self):
+        inp  = np.array([
+            [0, 2, 6],
+            [1, -4, 5]
+        ], dtype=np.float32)
+        inp_dev = cp.asarray(inp)
+        outp = np.zeros_like(inp)
+        outp_dev = cp.asarray(outp)
+
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxf(inp_dev, out=outp_dev, axis=1)
+
+        outp[:] = outp_dev.get()
+        # Expected for axis=1: differences between neighbouring columns; the last column is zero.
+        exp = np.array([
+            [2, 4, 0],
+            [-5, 9, 0]
+        ], dtype=np.float32)
+        np.testing.assert_array_equal(outp, exp)
+
+    def test_delxb_1dim(self):
+        inp = np.array([0, 1, 2, 4, 8, 0, 6], dtype=np.float32)
+        inp_dev = cp.asarray(inp)
+        outp = np.zeros_like(inp)
+        outp_dev = cp.asarray(outp)
+        # delxb is called without an axis argument and writes into a separate output array.
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxb(inp_dev, out=outp_dev)
+
+        outp[:] = outp_dev.get()
+        # Expected: exp[0] = 0 and exp[i] = inp[i] - inp[i-1] for the remaining elements.
+        exp = np.array([0, 1, 1, 2, 4, -8, 6], dtype=np.float32)
+        np.testing.assert_array_equal(outp, exp)
+
+    def test_delxb_2dim1(self):
+        inp  = np.array([
+            [0, 2, 6],
+            [1, -4, 5]
+        ], dtype=np.float32)
+        inp_dev = cp.asarray(inp)
+        outp = np.zeros_like(inp)
+        outp_dev = cp.asarray(outp)
+
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxb(inp_dev, out=outp_dev, axis=0)
+
+        outp[:] = outp_dev.get()
+
+        # Expected for axis=0: the first row is zero; row 1 holds inp[1] - inp[0].
+        exp = np.array([
+            [0, 0, 0],
+            [1, -6, -1],
+        ], dtype=np.float32)
+        np.testing.assert_array_equal(outp, exp)
+
+    def test_delxb_2dim2(self):
+        inp  = np.array([
+            [0, 2, 6],
+            [1, -4, 5]
+        ], dtype=np.float32)
+        inp_dev = cp.asarray(inp)
+        outp = np.zeros_like(inp)
+        outp_dev = cp.asarray(outp)
+
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxb(inp_dev, out=outp_dev, axis=1)
+
+        outp[:] = outp_dev.get()
+
+        # Expected for axis=1: the first column is zero; column j holds inp[:, j] - inp[:, j-1].
+        exp = np.array([
+            [0, 2, 4],
+            [0, -5, 9]
+        ], dtype=np.float32)
+        np.testing.assert_array_equal(outp, exp)
+
+    def test_delxf_2dim2complex(self):
+        inp = np.array([
+            [0, 2, 6],
+            [1, -4, 5]
+        ],dtype=np.float32) + 1j * np.array([
+            [0, 4, 12],
+            [2, -8, 10]
+        ],dtype=np.float32)
+        inp_dev = cp.asarray(inp)
+        outp = np.zeros_like(inp)
+        outp_dev = cp.asarray(outp)
+
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxf(inp_dev, out=outp_dev, axis=1)
+
+        outp[:] = outp_dev.get()
+        # Expected: real and imaginary parts are each differenced along axis 1; the last column is zero.
+        exp = np.array([
+            [2, 4, 0],
+            [-5, 9, 0]
+        ], dtype=np.float32) + 1j * np.array([
+            [4, 8, 0],
+            [-10, 18, 0]
+        ], dtype=np.float32)
+        np.testing.assert_array_equal(outp, exp)
+        
+    def test_delxf_3dim2(self):
+        inp = np.array([
+            [
+                [1, 2, 4,],
+                [7, 11, 16,],
+            ],
+            [
+                [22, 29, 37,],
+                [46, 56, 67]
+            ]
+        ], dtype=np.float32)
+        
+        inp_dev = cp.asarray(inp)
+        outp = np.zeros_like(inp)
+        outp_dev = cp.asarray(outp)
+
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxf(inp_dev, out=outp_dev, axis=1)
+
+        outp[:] = outp_dev.get()
+        # Expected for axis=1: in each outer block, row 0 is inp[:, 1] - inp[:, 0] and the last row is zero.
+        exp = np.array([
+            [
+                [6, 9, 12,],
+                [0, 0, 0,],
+            ],
+            [
+                [24, 27, 30,],
+                [0, 0, 0],
+            ]
+        ], dtype=np.float32)
+
+        np.testing.assert_array_equal(outp, exp)
+
+    def test_delxf_3dim1_unity(self):
+        # Fixed seed, so that a failure can be reproduced with exactly the same input.
+        inp = np.ascontiguousarray(np.random.RandomState(0).randn(33, 283, 142), dtype=np.float32)
+        inp_dev = cp.asarray(inp)
+        outp = np.zeros_like(inp)
+        outp_dev = cp.asarray(outp)
+
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxf(inp_dev, out=outp_dev, axis=0)
+        outp[:] = outp_dev.get()
+        # Reference: ptypy.utils.math_utils.delxf applied to the host array along the same axis.
+        exp = delxf(inp, axis=0)
+        np.testing.assert_array_almost_equal(outp, exp)
+
+    def test_delxf_3dim2_unity1(self):
+        inp = np.array([
+            [ [1], [2], [4]],
+            [ [8], [16], [32]]
+        ], dtype=np.float32)
+       
+        inp_dev = cp.asarray(inp)
+        outp = np.zeros_like(inp)
+        outp_dev = cp.asarray(outp)
+
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxf(inp_dev, out=outp_dev, axis=1)
+        outp[:] = outp_dev.get()
+        # Reference: ptypy.utils.math_utils.delxf applied to the host array along the same axis.
+        exp = delxf(inp, axis=1)
+        # The input has a trailing axis of length 1; both results are squeezed before the comparison.
+        np.testing.assert_array_almost_equal(np.squeeze(outp), np.squeeze(exp))
+
+    def test_delxf_3dim2_unity2(self):
+        inp = np.array([
+            [ [1, 2], [4, 7], [11,16] ],
+            [ [22,29], [37,46], [56,67]]
+        ], dtype=np.float32)
+       
+        inp_dev = cp.asarray(inp)
+        outp = np.zeros_like(inp)
+        outp_dev = cp.asarray(outp)
+
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxf(inp_dev, out=outp_dev, axis=1)
+        outp[:] = outp_dev.get()
+        # Reference: ptypy.utils.math_utils.delxf applied to the host array along the same axis.
+        exp = delxf(inp, axis=1)
+        # Device output and reference are compared after np.squeeze, to the default precision.
+        np.testing.assert_array_almost_equal(np.squeeze(outp), np.squeeze(exp))
+
+    def test_delxf_3dim2_unity(self):
+        # Fixed seed, so that a failure can be reproduced with exactly the same input.
+        inp = np.ascontiguousarray(np.random.RandomState(0).randn(33, 283, 142), dtype=np.float32)
+        inp_dev = cp.asarray(inp)
+        outp = np.zeros_like(inp)
+        outp_dev = cp.asarray(outp)
+
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxf(inp_dev, out=outp_dev, axis=1)
+        outp[:] = outp_dev.get()
+        # Reference: ptypy.utils.math_utils.delxf applied to the host array along the same axis.
+        exp = delxf(inp, axis=1)
+        np.testing.assert_array_almost_equal(outp, exp)
+
+    def test_delxf_3dim3_unity(self):
+        # Fixed seed, so that a failure can be reproduced with exactly the same input.
+        inp = np.ascontiguousarray(np.random.RandomState(0).randn(33, 283, 142), dtype=np.float32)
+        inp_dev = cp.asarray(inp)
+        outp = np.zeros_like(inp)
+        outp_dev = cp.asarray(outp)
+
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxf(inp_dev, out=outp_dev, axis=2)
+        outp[:] = outp_dev.get()
+        # Reference: ptypy.utils.math_utils.delxf applied to the host array along the same axis.
+        exp = delxf(inp, axis=2)
+        np.testing.assert_array_almost_equal(outp, exp)
+
+    def test_delxb_3dim3_unity(self):
+        # Fixed seed, so that a failure can be reproduced with exactly the same input.
+        inp = np.ascontiguousarray(np.random.RandomState(0).randn(33, 283, 142), dtype=np.float32)
+        inp_dev = cp.asarray(inp)
+        outp = np.zeros_like(inp)
+        outp_dev = cp.asarray(outp)
+
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxb(inp_dev, out=outp_dev, axis=2)
+        outp[:] = outp_dev.get()
+        # Reference: ptypy.utils.math_utils.delxb applied to the host array along the same axis.
+        exp = delxb(inp, axis=2)
+        np.testing.assert_array_almost_equal(outp, exp)
+
+    @unittest.skipIf(not perfrun, "performance test")
+    def test_perf_3d_0(self):
+        shape = [500, 1024, 1024]
+        inp = np.ones(shape, dtype=np.complex64)
+        inp_dev = cp.asarray(inp)
+        outp = np.ones_like(inp)
+        outp_dev = cp.asarray(outp)
+        # Constant input: every difference along axis 0 must be 0. out starts as ones, so zeros mean it was written.
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxf(inp_dev, out=outp_dev, axis=0)
+        outp[:] = outp_dev.get()
+        np.testing.assert_array_equal(outp, 0)
+
+    @unittest.skipIf(not perfrun, "performance test")
+    def test_perf_3d_1(self):
+        shape = [500, 1024, 1024]
+        inp = np.ones(shape, dtype=np.complex64)
+        inp_dev = cp.asarray(inp)
+        outp = np.ones_like(inp)
+        outp_dev = cp.asarray(outp)
+        # Constant input: every difference along axis 1 must be 0. out starts as ones, so zeros mean it was written.
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxf(inp_dev, out=outp_dev, axis=1)
+        outp[:] = outp_dev.get()
+        np.testing.assert_array_equal(outp, 0)
+
+    @unittest.skipIf(not perfrun, "performance test")
+    def test_perf_3d_2(self):
+        shape = [500, 1024, 1024]
+        inp = np.ones(shape, dtype=np.complex64)
+        inp_dev = cp.asarray(inp)
+        outp = np.ones_like(inp)
+        outp_dev = cp.asarray(outp)
+        # Constant input: every difference along axis 2 must be 0. out starts as ones, so zeros mean it was written.
+        DK = DerivativesKernel(inp.dtype, queue=self.stream)
+        DK.delxf(inp_dev, out=outp_dev, axis=2)
+        outp[:] = outp_dev.get()
+        np.testing.assert_array_equal(outp, 0)
\ No newline at end of file
diff --git a/test/accelerate_tests/cuda_cupy_tests/engine_tests.py b/test/accelerate_tests/cuda_cupy_tests/engine_tests.py
new file mode 100644
index 000000000..fe70b58bc
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/engine_tests.py
@@ -0,0 +1,172 @@
+"""
+Test for the ML engine.
+
+This file is part of the PTYPY package.
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: see LICENSE for details.
+"""
+import unittest
+
+from test import utils as tu
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines("cupy")
+import tempfile
+import shutil
+import numpy as np
+
+class MLCupyTest(unittest.TestCase):
+
+    def setUp(self):
+        self.outpath = tempfile.mkdtemp(suffix="ML_cupy_test")  # fresh output directory for every test
+
+    def tearDown(self):
+        shutil.rmtree(self.outpath)  # remove the directory created in setUp together with its contents
+
+    def check_engine_output(self, output, plotting=False, debug=False, scan="MF"):
+        """Assert that the serial and the cupy ML run in ``output`` (in that order) agree.
+
+        The mean squared difference of the cropped object and of the probe must not exceed
+        1e-2. ``plotting``/``debug`` open matplotlib figures; ``scan`` selects the storage key.
+        """
+        key = "S%sG00" %scan
+        P_ML_serial, P_ML_cupy = output
+        numiter = len(P_ML_serial.runtime["iter_info"])
+        LL_ML_serial = np.array([P_ML_serial.runtime["iter_info"][i]["error"][1] for i in range(numiter)])
+        LL_ML_cupy = np.array([P_ML_cupy.runtime["iter_info"][i]["error"][1] for i in range(numiter)])
+        crop = 42
+        OBJ_ML_serial, OBJ_ML_cupy = P_ML_serial.obj.S[key].data[0,crop:-crop,crop:-crop], P_ML_cupy.obj.S[key].data[0,crop:-crop,crop:-crop]
+        PRB_ML_serial, PRB_ML_cupy = P_ML_serial.probe.S[key].data[0], P_ML_cupy.probe.S[key].data[0]
+        MED_ML_serial = np.median(np.angle(OBJ_ML_serial))
+        MED_ML_cupy = np.median(np.angle(OBJ_ML_cupy))
+        eng_ML_serial = P_ML_serial.engines["engine00"]
+        eng_ML_cupy = P_ML_cupy.engines["engine00"]
+        if debug:
+            import matplotlib.pyplot as plt
+            plt.figure("ML serial debug")
+            plt.imshow(np.abs(eng_ML_serial.debug))
+            plt.figure("ML cupy debug")
+            plt.imshow(np.abs(eng_ML_cupy.debug))
+            plt.show()
+
+        if plotting:
+            import matplotlib.pyplot as plt
+            plt.figure("Errors")
+            plt.plot(LL_ML_serial, label="ML_serial")
+            plt.plot(LL_ML_cupy, label="ML_cupy")
+            plt.legend()
+            plt.show()
+            plt.figure("Phase ML serial")
+            plt.imshow(np.angle(OBJ_ML_serial*np.exp(-1j*MED_ML_serial)))
+            plt.figure("Amplitude ML serial")
+            plt.imshow(np.abs(OBJ_ML_serial))
+            plt.figure("Phase ML cupy")
+            plt.imshow(np.angle(OBJ_ML_cupy*np.exp(-1j*MED_ML_cupy)))
+            plt.figure("Amplitude ML cupy")
+            plt.imshow(np.abs(OBJ_ML_cupy))
+            plt.figure("Phase difference")
+            plt.imshow(np.angle(OBJ_ML_cupy) - np.angle(OBJ_ML_serial), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.figure("Amplitude difference")
+            plt.imshow(np.abs(OBJ_ML_cupy) - np.abs(OBJ_ML_serial), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.show()
+        MSE_ob = (np.mean(np.abs(OBJ_ML_cupy - OBJ_ML_serial)**2))
+        MSE_pr = (np.mean(np.abs(PRB_ML_cupy - PRB_ML_serial)**2))
+        np.testing.assert_allclose(MSE_ob, 0.0, atol=1e-2, 
+                                    err_msg="The object arrays are not matching as expected")
+        np.testing.assert_allclose(MSE_pr, 0.0, atol=1e-2, 
+                                    err_msg="The probe arrays are not matching as expected")
+    
+    def test_ML_cupy_base(self):
+        out = []  # filled in loop order: serial result first, then cupy
+        for eng in ["ML_serial", "ML_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False  # baseline: floating intensities, reg_del2 and scale_precond all off
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.scale_precond = False
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_ML_cupy_regularizer(self):
+        out = []  # filled in loop order: serial result first, then cupy
+        for eng in ["ML_serial", "ML_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = True  # the setting under test: regulariser switched on
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.scale_precond = False
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_ML_cupy_preconditioner(self):
+        out = []  # filled in loop order: serial result first, then cupy
+        for eng in ["ML_serial", "ML_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.scale_precond = True  # the setting under test, together with scale_probe_object below
+            engine_params.scale_probe_object = 1e-6
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_ML_cupy_floating(self):
+        out = []  # filled in loop order: serial result first, then cupy
+        for eng in ["ML_serial", "ML_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = True  # the setting under test: floating intensities switched on
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.scale_precond = False
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_ML_cupy_smoothing_regularizer(self):
+        out = []  # filled in loop order: serial result first, then cupy
+        for eng in ["ML_serial", "ML_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 200  # this test runs 200 iterations; the other tests in this class use 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = False
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 20  # the settings under test: gradient smoothing and its decay
+            engine_params.smooth_gradient_decay = 1/10.
+            engine_params.scale_precond = False
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_ML_cupy_all(self):
+        out = []  # filled in loop order: serial result first, then cupy
+        for eng in ["ML_serial", "ML_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 100
+            engine_params.floating_intensities = False
+            engine_params.reg_del2 = True  # regulariser, gradient smoothing and preconditioner are all enabled here
+            engine_params.reg_del2_amplitude = 1.
+            engine_params.smooth_gradient = 20
+            engine_params.smooth_gradient_decay = 1/10.
+            engine_params.scale_precond = True
+            engine_params.scale_probe_object = 1e-6
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,  # NOTE(review): verbose_level below is "info", the sibling tests use "critical" - confirm intended
+                                           scanmodel="BlockFull", autosave=False, verbose_level="info"))
+        self.check_engine_output(out, plotting=False, debug=False)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/accelerate_tests/cuda_cupy_tests/engine_utils_test.py b/test/accelerate_tests/cuda_cupy_tests/engine_utils_test.py
new file mode 100644
index 000000000..017f85ba4
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/engine_utils_test.py
@@ -0,0 +1,52 @@
+'''
+Tests comparing Regul_del2_cupy with the numpy Regul_del2 regulariser.
+
+'''
+
+import unittest
+import numpy as np
+from . import perfrun, CupyCudaTest, have_cupy
+
+if have_cupy():
+    import cupy as cp
+    from ptypy.accelerate.cuda_cupy.engines.ML_cupy import Regul_del2_cupy
+    from ptypy.engines.ML import Regul_del2
+
+
+class EngineUtilsTest(CupyCudaTest):
+
+    def test_regul_del2_grad_unity(self):
+        ## Arrange: a random (unseeded) 40x40 complex64 array and its device copy
+        A = (np.random.randn(40,40)
+        +1j*np.random.randn(40,40)).astype(np.complex64)
+        A_dev = cp.asarray(A)
+
+        ## Act: evaluate grad() of the numpy and of the cupy regulariser, both constructed with 0.1
+        Reg = Regul_del2(0.1)
+        Reg_dev = Regul_del2_cupy(0.1)
+        grad_dev = Reg_dev.grad(A_dev).get()
+        grad = Reg.grad(A)
+        # NOTE(review): rtol=1e-7 is about the float32 resolution and the input is unseeded - confirm this is stable
+        ## Assert: the gradients and the LL attribute of both objects must agree
+        np.testing.assert_allclose(grad_dev, grad, rtol=1e-7)
+        np.testing.assert_allclose(Reg_dev.LL, Reg.LL, rtol=1e-7)
+
+
+    def test_regul_del2_coeff_unity(self):
+        ## Arrange: two random (unseeded) 40x40 complex64 arrays and their device copies
+        A = (np.random.randn(40,40)
+        +1j*np.random.randn(40,40)).astype(np.complex64)
+        B = (np.random.randn(40,40)
+        +1j*np.random.randn(40,40)).astype(np.complex64)
+        A_dev = cp.asarray(A)
+        B_dev = cp.asarray(B)
+
+        ## Act: evaluate poly_line_coeffs() of the numpy and of the cupy regulariser, both constructed with 0.1
+        Reg = Regul_del2(0.1)
+        Reg_dev = Regul_del2_cupy(0.1)
+        d = Reg_dev.poly_line_coeffs(A_dev, B_dev)
+        c = Reg.poly_line_coeffs(A, B)
+        # d is passed to numpy without an explicit .get();
+        # NOTE(review): presumably poly_line_coeffs returns host values - confirm
+        ## Assert: the coefficients must agree to rtol=1e-6
+        np.testing.assert_allclose(c, d, rtol=1e-6)
diff --git a/test/accelerate_tests/cuda_cupy_tests/fft_scaling_test.py b/test/accelerate_tests/cuda_cupy_tests/fft_scaling_test.py
new file mode 100644
index 000000000..00d785859
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/fft_scaling_test.py
@@ -0,0 +1,204 @@
+'''
+Tests of the forward and inverse FFT scaling of the FFT_cuda and FFT_cupy wrappers.
+
+'''
+
+import unittest
+import numpy as np
+from . import CupyCudaTest, have_cupy
+
+if have_cupy():
+    import cupy as cp
+    from ptypy.accelerate.cuda_cupy.cufft import FFT_cuda, FFT_cupy
+
+COMPLEX_TYPE = np.complex64
+FLOAT_TYPE = np.float32
+INT_TYPE = np.int32
+
+def get_forward_cuFFT(f, stream, pre_fft, post_fft, inplace,
+                      symmetric, external=True):
+    """Return the ``ft`` method of a newly built forward FFT object.
+
+    ``external=True`` selects ``FFT_cuda``, anything falsy selects ``FFT_cupy``.
+    """
+    fft_class = FFT_cuda if external else FFT_cupy
+    return fft_class(f, stream, pre_fft=pre_fft, post_fft=post_fft, inplace=inplace,
+                     symmetric=symmetric, forward=True).ft
+
+def get_reverse_cuFFT(f, stream, pre_fft, post_fft, inplace,
+                      symmetric, external=True):
+    """Return the ``ift`` method of a newly built inverse FFT object.
+
+    ``external=True`` selects ``FFT_cuda``, anything falsy selects ``FFT_cupy``.
+    """
+    fft_class = FFT_cuda if external else FFT_cupy
+    return fft_class(f, stream, pre_fft=pre_fft, post_fft=post_fft, inplace=inplace,
+                     symmetric=symmetric, forward=False).ift
+
+
+
+class FftScalingTest(CupyCudaTest):
+    """Check the scaling of forward and inverse FFTs applied to an all-ones input."""
+    def get_input(self):
+        """Return one 32x32 batch of complex ones as a host array."""
+        rows = cols = 32
+        batches = 1
+        return np.ones(shape=(batches, rows, cols), dtype=COMPLEX_TYPE)
+
+    #### Trivial forward transform tests ####
+
+    def fwd_test(self, symmetric, factory, preffact=None, postfact=None, external=True):
+        """Forward-transform an all-ones input in place and check the scaling.
+
+        Only element [0, 0, 0] may be non-zero; it must equal the element count times
+        the scale (1/sqrt(count) if ``symmetric``, else 1) and the filter factors.
+        """
+        f = self.get_input()
+        f_d = cp.asarray(f)
+        pref_d = post_d = None
+        if preffact is None:
+            preffact = 1.0
+        else:
+            pref_d = cp.asarray(preffact * np.ones(shape=f.shape[-2:], dtype=np.complex64))
+        if postfact is None:
+            postfact = 1.0
+        else:
+            post_d = cp.asarray(postfact * np.ones(shape=f.shape[-2:], dtype=np.complex64))
+        ft = factory(f, self.stream, pre_fft=pref_d, post_fft=post_d, inplace=True,
+                     symmetric=symmetric, external=external)
+        ft(f_d, f_d)
+        f_back = f_d.get()
+        elements = f.shape[-2] * f.shape[-1]
+        scale = 1.0 / np.sqrt(elements) if symmetric else 1.0
+        self.assertAlmostEqual(f_back[0,0,0], elements * scale * preffact * postfact)
+        np.testing.assert_array_almost_equal(f_back.flat[1:], 0)
+
+    def test_fwd_noscale_cufft(self):
+        self.fwd_test(False, get_forward_cuFFT)
+
+    def test_fwd_noscale_cufft_cupy(self):
+        self.fwd_test(False, get_forward_cuFFT, external=False)
+
+    def test_fwd_scale_cufft(self):
+        self.fwd_test(True, get_forward_cuFFT)
+
+    def test_fwd_scale_cufft_cupy(self):
+        self.fwd_test(True, get_forward_cuFFT, external=False)
+
+    def test_prefilt_fwd_noscale_cufft(self):
+        self.fwd_test(False, get_forward_cuFFT, preffact=2.0)
+
+    def test_prefilt_fwd_noscale_cufft_cupy(self):
+        self.fwd_test(False, get_forward_cuFFT, preffact=2.0, external=False)
+
+    def test_prefilt_fwd_scale_cufft(self):
+        self.fwd_test(True, get_forward_cuFFT, preffact=2.0)
+
+    def test_prefilt_fwd_scale_cufft_cupy(self):
+        self.fwd_test(True, get_forward_cuFFT, preffact=2.0, external=False)
+
+    def test_postfilt_fwd_noscale_cufft(self):
+        self.fwd_test(False, get_forward_cuFFT, postfact=2.0)
+
+    def test_postfilt_fwd_noscale_cufft_cupy(self):
+        self.fwd_test(False, get_forward_cuFFT, postfact=2.0, external=False)
+
+    def test_postfilt_fwd_scale_cufft(self):
+        self.fwd_test(True, get_forward_cuFFT, postfact=2.0)
+
+    def test_postfilt_fwd_scale_cufft_cupy(self):
+        self.fwd_test(True, get_forward_cuFFT, postfact=2.0, external=False)
+
+    def test_prepostfilt_fwd_noscale_cufft(self):
+        self.fwd_test(False, get_forward_cuFFT, postfact=2.0, preffact=1.5)
+
+    def test_prepostfilt_fwd_noscale_cufft_cupy(self):
+        self.fwd_test(False, get_forward_cuFFT, postfact=2.0, preffact=1.5, external=False)
+
+    def test_prepostfilt_fwd_scale_cufft(self):
+        self.fwd_test(True, get_forward_cuFFT, postfact=2.0, preffact=1.5)
+
+    def test_prepostfilt_fwd_scale_cufft_cupy(self):
+        self.fwd_test(True, get_forward_cuFFT, postfact=2.0, preffact=1.5, external=False)
+
+
+    ############# Trivial inverse transform tests #########
+
+    def rev_test(self, symmetric, factory, preffact=None, postfact=None, external=True):
+        """Inverse-transform an all-ones input in place and check the scaling.
+
+        Only element [0, 0, 0] may be non-zero; it must equal the filter factors,
+        multiplied by sqrt(element count) when ``symmetric`` is set.
+        """
+        f = self.get_input()
+        f_d = cp.asarray(f)
+        pref_d = post_d = None
+        if preffact is None:
+            preffact = 1.0
+        else:
+            pref_d = cp.asarray(preffact * np.ones(shape=f.shape[-2:], dtype=np.complex64))
+        if postfact is None:
+            postfact = 1.0
+        else:
+            post_d = cp.asarray(postfact * np.ones(shape=f.shape[-2:], dtype=np.complex64))
+        ift = factory(f, self.stream, pre_fft=pref_d, post_fft=post_d, inplace=True,
+                      symmetric=symmetric, external=external)
+        ift(f_d, f_d)
+        f_back = f_d.get()
+        elements = f.shape[-2] * f.shape[-1]
+        scale = np.sqrt(elements) if symmetric else 1.0
+        self.assertAlmostEqual(f_back[0,0,0], scale * preffact * postfact)
+        np.testing.assert_array_almost_equal(f_back.flat[1:], 0)
+
+
+    def test_rev_noscale_cufft(self):
+        self.rev_test(False, get_reverse_cuFFT)
+
+    def test_rev_noscale_cufft_cupy(self):
+        self.rev_test(False, get_reverse_cuFFT, external=False)
+
+    def test_rev_scale_cufft(self):
+        self.rev_test(True, get_reverse_cuFFT)
+
+    def test_rev_scale_cufft_cupy(self):
+        self.rev_test(True, get_reverse_cuFFT, external=False)
+
+    def test_prefilt_rev_noscale_cufft(self):
+        self.rev_test(False, get_reverse_cuFFT, preffact=1.5)
+
+    def test_prefilt_rev_noscale_cufft_cupy(self):
+        self.rev_test(False, get_reverse_cuFFT, preffact=1.5, external=False)
+
+    def test_prefilt_rev_scale_cufft(self):
+        self.rev_test(True, get_reverse_cuFFT, preffact=1.5)
+
+    def test_prefilt_rev_scale_cufft_cupy(self):
+        self.rev_test(True, get_reverse_cuFFT, preffact=1.5, external=False)
+
+    def test_postfilt_rev_noscale_cufft(self):
+        self.rev_test(False, get_reverse_cuFFT, postfact=1.5)
+
+    def test_postfilt_rev_noscale_cufft_cupy(self):
+        self.rev_test(False, get_reverse_cuFFT, postfact=1.5, external=False)
+
+    def test_postfilt_rev_scale_cufft(self):
+        self.rev_test(True, get_reverse_cuFFT, postfact=1.5)
+
+    def test_postfilt_rev_scale_cufft_cupy(self):
+        self.rev_test(True, get_reverse_cuFFT, postfact=1.5, external=False)
+
+    def test_prepostfilt_rev_noscale_cufft(self):
+        self.rev_test(False, get_reverse_cuFFT, postfact=1.5, preffact=2.0)
+
+    def test_prepostfilt_rev_noscale_cufft_cupy(self):
+        self.rev_test(False, get_reverse_cuFFT, postfact=1.5, preffact=2.0, external=False)
+
+    def test_prepostfilt_rev_scale_cufft(self):
+        self.rev_test(True, get_reverse_cuFFT, postfact=1.5, preffact=2.0)
+
+    def test_prepostfilt_rev_scale_cufft_cupy(self):
+        self.rev_test(True, get_reverse_cuFFT, postfact=1.5, preffact=2.0, external=False)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/accelerate_tests/cuda_cupy_tests/fft_setstream_test.py b/test/accelerate_tests/cuda_cupy_tests/fft_setstream_test.py
new file mode 100644
index 000000000..ab6c92830
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/fft_setstream_test.py
@@ -0,0 +1,97 @@
+import unittest
+import numpy as np
+from . import CupyCudaTest, have_cupy
+import time
+
+if have_cupy():
+    import cupy as cp
+    from ptypy.accelerate.cuda_cupy.cufft import FFT_cuda as cuFFT
+    from ptypy.accelerate.cuda_cupy.cufft import FFT_cupy as cupyCuFFT
+
+    COMPLEX_TYPE = np.complex64
+    FLOAT_TYPE = np.float32
+    INT_TYPE = np.int32
+
+class FftSetStreamTest(CupyCudaTest):
+
+    def helper(self, FFT):
+        """Check that FFT.ft() runs on the stream assigned to ``queue``."""
+        f = np.ones(shape=(200, 128, 128), dtype=COMPLEX_TYPE)
+        t1 = time.time()
+        FW = FFT(f, self.stream, pre_fft=None, post_fft=None, inplace=True,
+            symmetric=True)
+        t2 = time.time()
+        dur1 = t2 - t1
+        with self.stream:
+            f_dev = cp.asarray(f)
+            self.stream.synchronize()
+
+        # measure with events to make sure that something actually
+        # happened in the right stream
+        with self.stream:
+            ev1 = cp.cuda.Event()
+            ev2 = cp.cuda.Event()
+            rt1 = time.time()
+            ev1.record()
+        FW.ft(f_dev, f_dev)
+        with self.stream:
+            ev2.record()
+            ev1.synchronize()
+            ev2.synchronize()
+            self.stream.synchronize()
+            # event time is reported in milliseconds; convert to seconds
+            gput = cp.cuda.get_elapsed_time(ev1, ev2)*1e-3
+        rt2 = time.time()
+        cput = rt2-rt1
+        rel = 1-gput/cput
+
+        print('Original: CPU={}, GPU={}, reldiff={}'.format(cput, gput, rel))
+
+        self.assertEqual(self.stream, FW.queue)
+        self.assertLess(rel, 0.3)  # max 30% diff
+
+        stream2 = cp.cuda.Stream()
+
+        measure = False  # set to True to time the stream re-assignment
+        avg = 100 if measure else 1
+        t1 = time.time()
+        for _ in range(avg):
+            FW.queue = stream2
+        stream2.synchronize()
+        t2 = time.time()
+        dur2 = (t2 - t1)/avg
+
+        with stream2:
+            ev1 = cp.cuda.Event()
+            ev2 = cp.cuda.Event()
+            rt1 = time.time()
+            ev1.record()
+        FW.ft(f_dev, f_dev)
+        with stream2:
+            ev2.record()
+            ev1.synchronize()
+            ev2.synchronize()
+            stream2.synchronize()
+            gput = cp.cuda.get_elapsed_time(ev1, ev2)*1e-3
+        self.stream.synchronize()
+        rt2 = time.time()
+        cput = rt2-rt1
+        rel = 1 - gput/cput
+
+        print('New: CPU={}, GPU={}, reldiff={}'.format(cput, gput, rel))
+
+        self.assertEqual(stream2, FW.queue)
+        self.assertLess(rel, 0.3)  # max 30% diff
+
+        if measure:
+            # fail on purpose so the timings printed below get noticed
+            print('initial: {}, set_stream: {}'.format(dur1, dur2))
+            self.fail('measure=True is for manual timing only')
+
+
+
+    def test_set_stream_b_cufft(self):
+        self.helper(cuFFT)
+
+    def test_set_stream_c_cupy_cufft(self):
+        self.helper(cupyCuFFT)
diff --git a/test/accelerate_tests/cuda_cupy_tests/fourier_update_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/fourier_update_kernel_test.py
new file mode 100644
index 000000000..18fa82aa8
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/fourier_update_kernel_test.py
@@ -0,0 +1,685 @@
+'''
+
+
+'''
+
+import unittest
+import numpy as np
+from . import CupyCudaTest, have_cupy
+
+
+if have_cupy():
+    import cupy as cp
+    from ptypy.accelerate.cuda_cupy.kernels import FourierUpdateKernel
+
+COMPLEX_TYPE = np.complex64
+FLOAT_TYPE = np.float32
+INT_TYPE = np.int32
+
+class FourierUpdateKernelTest(CupyCudaTest):
+
+
+    def test_fmag_all_update_UNITY(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        G = 2  # number of object modes
+
+        E = B  # probe size y (not used below)
+        F = C  # probe size x (not used below)
+
+        scan_pts = 2  # one dimensional scan point number
+
+        N = scan_pts ** 2
+        total_number_modes = G * D
+        A = N * total_number_modes  # 4 scan positions (2x2 grid) x 4 modes = 16 frames
+
+        f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+
+        fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE)  # the measured magnitudes, NxBxC
+        fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype)
+        fmag[:] = fmag_fill
+
+        mask = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE)  # the masks for the measured magnitudes, NxBxC here
+        mask_fill = np.ones_like(mask)
+        mask_fill[::2, ::2] = 0  # zeroes the even rows of the even frames (not a true checkerboard)
+        mask[:] = mask_fill
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((N,))
+        Y = Y.reshape((N,))
+
+        addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [position_idx, 0, 0],
+                                                             [position_idx, 0, 0]])
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        # print("address book is:")
+        # print(repr(addr))
+
+        '''
+        test
+        '''
+        mask_sum = mask.sum(-1).sum(-1)
+
+        err_fmag = np.zeros(N, dtype=FLOAT_TYPE)
+        from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel
+        pbound_set = 0.9
+        nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes)
+        FUK = FourierUpdateKernel(f, nmodes=total_number_modes)
+
+        nFUK.allocate()
+        FUK.allocate()
+
+        nFUK.fourier_error(f, addr, fmag, mask, mask_sum)
+        nFUK.error_reduce(addr, err_fmag)
+        # print(np.sqrt(pbound_set/err_fmag))
+        f_d = cp.asarray(f)
+        fmag_d = cp.asarray(fmag)
+        mask_d = cp.asarray(mask)
+        err_fmag_d = cp.asarray(err_fmag)
+        addr_d = cp.asarray(addr)
+
+        # seed the GPU kernel with the fdev/ferr computed by the numpy reference
+
+        FUK.gpu.fdev = cp.asarray(nFUK.npy.fdev)
+        FUK.gpu.ferr = cp.asarray(nFUK.npy.ferr)
+
+        FUK.fmag_all_update(f_d, addr_d, fmag_d, mask_d, err_fmag_d, pbound=pbound_set)
+
+
+        nFUK.fmag_all_update(f, addr, fmag, mask, err_fmag, pbound=pbound_set)
+        expected_f = f
+        measured_f = f_d.get()
+        np.testing.assert_allclose(expected_f, measured_f, rtol=1e-6, err_msg="Numpy f "
+                                                                      "is \n%s, \nbut gpu f is \n %s, \n mask is:\n %s \n" %  (repr(expected_f),
+                                                                                                                               repr(measured_f),
+                                                                                                                               repr(mask)))
+
+    def test_fmag_update_nopbound_UNITY(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        G = 2  # number of object modes
+
+        E = B  # probe size y (not used below)
+        F = C  # probe size x (not used below)
+
+        scan_pts = 2  # one dimensional scan point number
+
+        N = scan_pts ** 2
+        total_number_modes = G * D
+        A = N * total_number_modes  # 4 scan positions (2x2 grid) x 4 modes = 16 frames
+
+        f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+
+        fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE)  # the measured magnitudes, NxBxC
+        fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype)
+        fmag[:] = fmag_fill
+
+        mask = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE)  # the masks for the measured magnitudes, NxBxC here
+        mask_fill = np.ones_like(mask)
+        mask_fill[::2, ::2] = 0  # zeroes the even rows of the even frames (not a true checkerboard)
+        mask[:] = mask_fill
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((N,))
+        Y = Y.reshape((N,))
+
+        addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [position_idx, 0, 0],
+                                                             [position_idx, 0, 0]])
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        # print("address book is:")
+        # print(repr(addr))
+
+        '''
+        test
+        '''
+        mask_sum = mask.sum(-1).sum(-1)
+
+        err_fmag = np.zeros(N, dtype=FLOAT_TYPE)
+        from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel
+        nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes)
+        FUK = FourierUpdateKernel(f, nmodes=total_number_modes)
+
+        nFUK.allocate()
+        FUK.allocate()
+
+        nFUK.fourier_error(f, addr, fmag, mask, mask_sum)
+        nFUK.error_reduce(addr, err_fmag)
+        # print(np.sqrt(pbound_set/err_fmag))
+        f_d = cp.asarray(f)
+        fmag_d = cp.asarray(fmag)
+        mask_d = cp.asarray(mask)
+        addr_d = cp.asarray(addr)
+
+        # seed the GPU kernel with the fdev/ferr computed by the numpy reference
+
+        FUK.gpu.fdev = cp.asarray(nFUK.npy.fdev)
+        FUK.gpu.ferr = cp.asarray(nFUK.npy.ferr)
+
+        FUK.fmag_update_nopbound(f_d, addr_d, fmag_d, mask_d)
+        nFUK.fmag_update_nopbound(f, addr, fmag, mask)
+
+        expected_f = f
+        measured_f = f_d.get()
+        np.testing.assert_allclose(measured_f, expected_f, rtol=1e-6, err_msg="Numpy f "
+                                                                      "is \n%s, \nbut gpu f is \n %s, \n mask is:\n %s \n" %  (repr(expected_f),
+                                                                                                                               repr(measured_f),
+                                                                                                                               repr(mask)))
+
+
+    def test_fourier_error_UNITY(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        G = 2  # number of object modes
+
+        E = B  # probe size y (not used below)
+        F = C  # probe size x (not used below)
+
+        scan_pts = 2  # one dimensional scan point number
+
+        N = scan_pts ** 2
+        total_number_modes = G * D
+        A = N * total_number_modes  # 4 scan positions (2x2 grid) x 4 modes = 16 frames
+
+        f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+
+        fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE)  # the measured magnitudes, NxBxC
+        fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype)
+        fmag[:] = fmag_fill
+
+        mask = np.empty(shape=(N, B, C),
+                        dtype=FLOAT_TYPE)  # the masks for the measured magnitudes, NxBxC here
+        mask_fill = np.ones_like(mask)
+        mask_fill[::2, ::2] = 0  # zeroes the even rows of the even frames (not a true checkerboard)
+        mask[:] = mask_fill
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((N,))
+        Y = Y.reshape((N,))
+
+        addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [position_idx, 0, 0],
+                                                             [position_idx, 0, 0]])
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        mask_sum = mask.sum(-1).sum(-1)
+
+        from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel
+        f_d = cp.asarray(f)
+        fmag_d = cp.asarray(fmag)
+        mask_d = cp.asarray(mask)
+        addr_d = cp.asarray(addr)
+        mask_sum_d = cp.asarray(mask_sum)
+
+        nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes)
+        FUK = FourierUpdateKernel(f, nmodes=total_number_modes)
+
+        nFUK.allocate()
+        FUK.allocate()
+
+        nFUK.fourier_error(f, addr, fmag, mask, mask_sum)
+        FUK.fourier_error(f_d, addr_d, fmag_d, mask_d, mask_sum_d)
+
+        expected_fdev = nFUK.npy.fdev
+        measured_fdev = FUK.gpu.fdev.get()
+        np.testing.assert_allclose(expected_fdev, measured_fdev, rtol=1e-6, err_msg="Numpy fdev "
+                                                                            "is \n%s, \nbut gpu fdev is \n %s, \n " % (
+                                                                            repr(expected_fdev),
+                                                                            repr(measured_fdev)))
+
+        expected_ferr = nFUK.npy.ferr
+        measured_ferr = FUK.gpu.ferr.get()
+
+        np.testing.assert_array_equal(expected_ferr, measured_ferr, err_msg="Numpy ferr "
+                                                                            "is \n%s, \nbut gpu ferr is \n %s, \n " % (
+                                                                            repr(expected_ferr),
+                                                                            repr(measured_ferr)))
+    def test_fourier_deviation_UNITY(self):
+        '''
+        setup - same fixture as the fourier_error test; the mask is built but not used here.
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        G = 2  # number of object modes
+
+        E = B  # probe size y (not used below)
+        F = C  # probe size x (not used below)
+
+        scan_pts = 2  # one dimensional scan point number
+
+        N = scan_pts ** 2
+        total_number_modes = G * D
+        A = N * total_number_modes  # 4 scan positions (2x2 grid) x 4 modes = 16 frames
+
+        f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+
+        fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE)  # the measured magnitudes, NxBxC
+        fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype)
+        fmag[:] = fmag_fill
+
+        mask = np.empty(shape=(N, B, C),
+                        dtype=FLOAT_TYPE)  # the masks for the measured magnitudes, NxBxC here
+        mask_fill = np.ones_like(mask)
+        mask_fill[::2, ::2] = 0  # zeroes the even rows of the even frames (not a true checkerboard)
+        mask[:] = mask_fill
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((N,))
+        Y = Y.reshape((N,))
+
+        addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [position_idx, 0, 0],
+                                                             [position_idx, 0, 0]])
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        mask_sum = mask.sum(-1).sum(-1)  # mask / mask_sum are not passed to the calls below
+
+        from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel
+        f_d = cp.asarray(f)
+        fmag_d = cp.asarray(fmag)
+        addr_d = cp.asarray(addr)
+
+        nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes)
+        FUK = FourierUpdateKernel(f, nmodes=total_number_modes)
+
+        nFUK.allocate()
+        FUK.allocate()
+
+        nFUK.fourier_deviation(f, addr, fmag)
+        FUK.fourier_deviation(f_d, addr_d, fmag_d)
+
+        expected_fdev = nFUK.npy.fdev
+        measured_fdev = FUK.gpu.fdev.get()
+        np.testing.assert_allclose(measured_fdev, expected_fdev,  rtol=1e-6, err_msg="Numpy fdev "
+                                                                            "is \n%s, \nbut gpu fdev is \n %s, \n " % (
+                                                                            repr(expected_fdev),
+                                                                            repr(measured_fdev)))
+
+
+
+    def test_error_reduce_UNITY(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        G = 2  # number of object modes
+
+        E = B  # probe size y (not used below)
+        F = C  # probe size x (not used below)
+
+        scan_pts = 2  # one dimensional scan point number
+
+        N = scan_pts ** 2
+        total_number_modes = G * D
+        A = N * total_number_modes  # 4 scan positions (2x2 grid) x 4 modes = 16 frames
+
+        f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+
+        fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE)  # the measured magnitudes, NxBxC
+        fmag_fill = np.arange(np.prod(fmag.shape).item()).reshape(fmag.shape).astype(fmag.dtype)
+        fmag[:] = fmag_fill
+
+        mask = np.empty(shape=(N, B, C),
+                        dtype=FLOAT_TYPE)  # the masks for the measured magnitudes, NxBxC here
+        mask_fill = np.ones_like(mask)
+        mask_fill[::2, ::2] = 0  # zeroes the even rows of the even frames (not a true checkerboard)
+        mask[:] = mask_fill
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((N,))
+        Y = Y.reshape((N,))
+
+        addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [position_idx, 0, 0],
+                                                             [position_idx, 0, 0]])
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        # print("address book is:")
+        # print(repr(addr))
+
+        '''
+        test
+        '''
+        err_fmag = np.zeros(N, dtype=FLOAT_TYPE)
+        mask_sum = mask.sum(-1).sum(-1)
+
+        from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel
+        f_d = cp.asarray(f)
+        fmag_d = cp.asarray(fmag)
+        mask_d = cp.asarray(mask)
+        addr_d = cp.asarray(addr)
+        err_fmag_d = cp.asarray(err_fmag)
+        mask_sum_d = cp.asarray(mask_sum)
+        nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes)
+        FUK = FourierUpdateKernel(f, nmodes=total_number_modes, queue_thread=self.stream)
+
+        nFUK.allocate()
+        FUK.allocate()
+
+        # reference result from the numpy kernel
+        nFUK.fourier_error(f, addr, fmag, mask, mask_sum)
+        nFUK.error_reduce(addr, err_fmag)
+
+        # same two steps on the GPU kernel (built with queue_thread=self.stream)
+        FUK.fourier_error(f_d, addr_d, fmag_d, mask_d, mask_sum_d)
+        FUK.error_reduce(addr_d, err_fmag_d)
+
+        expected_err_fmag = err_fmag
+        measured_err_fmag = err_fmag_d.get()
+
+        np.testing.assert_allclose(expected_err_fmag, measured_err_fmag, rtol=1.15207385e-07,
+                                                                        err_msg="Numpy err_fmag "
+                                                                            "is \n%s, \nbut gpu err_fmag is \n %s, \n " % (
+                                                                            repr(expected_err_fmag),
+                                                                            repr(measured_err_fmag)))
+
+    def test_error_reduce(self):
+        # ferr as produced by the fixture of the previous test (even rows of even frames are masked out)
+        ferr = np.array([[[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
+                          [7.54033208e-01, 3.04839879e-01, 5.56465909e-02, 6.45330548e-03, 1.57260016e-01],
+                          [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
+                          [5.26210022e+00, 6.81290817e+00, 8.56371498e+00, 1.05145216e+01, 1.26653280e+01],
+                          [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]],
+
+                         [[1.61048353e+00, 2.15810299e+00, 2.78572226e+00, 3.49334168e+00, 4.28096104e+00],
+                          [5.14858055e+00, 6.09619951e+00, 7.12381887e+00, 8.23143768e+00, 9.41905785e+00],
+                          [1.06866770e+01, 1.20342960e+01, 1.34619150e+01, 1.49695349e+01, 1.65571537e+01],
+                          [1.82247734e+01, 1.99723930e+01, 2.18000126e+01, 2.37076321e+01, 2.56952515e+01],
+                          [2.77628708e+01, 2.99104881e+01, 3.21381073e+01, 3.44457283e+01, 3.68333473e+01]],
+
+                         [[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
+                          [6.31699409e+01, 6.82966690e+01, 7.36233978e+01, 7.91501160e+01, 8.48768463e+01],
+                          [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
+                          [1.23437180e+02, 1.30563919e+02, 1.37890640e+02, 1.45417374e+02, 1.53144089e+02],
+                          [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]],
+
+                         [[4.58764343e+01, 4.86257210e+01, 5.14550095e+01, 5.43642960e+01, 5.73535805e+01],
+                          [6.04228668e+01, 6.35721550e+01, 6.68014374e+01, 7.01107254e+01, 7.35000076e+01],
+                          [7.69692993e+01, 8.05185852e+01, 8.41478729e+01, 8.78571548e+01, 9.16464386e+01],
+                          [9.55157242e+01, 9.94650116e+01, 1.03494293e+02, 1.07603584e+02, 1.11792870e+02],
+                          [1.16062157e+02, 1.20411446e+02, 1.24840721e+02, 1.29350006e+02, 1.33939301e+02]]],
+                        dtype=FLOAT_TYPE)
+        # print(ferr.shape)
+        scan_pts = 2  # one dimensional scan point number
+        N = scan_pts ** 2
+
+        addr = np.zeros((N, 1, 5, 3), dtype=INT_TYPE)  # integer address book, as in the other tests
+        aux = np.zeros((4, 5, 5))
+        FUK = FourierUpdateKernel(aux, nmodes=1)
+        err_mag = np.zeros(N, dtype=FLOAT_TYPE)
+        err_mag_d = cp.asarray(err_mag)
+        FUK.gpu.ferr = cp.asarray(ferr)
+        addr_d = cp.asarray(addr)
+
+        FUK.error_reduce(addr_d, err_mag_d)
+
+        # print(repr(ferr))
+        measured_err_mag = err_mag_d.get()
+
+        # print(repr(measured_err_mag))
+
+        expected_err_mag = np.array([45.096806,  388.54788, 1059.5702, 2155.6968], dtype=FLOAT_TYPE)
+
+        np.testing.assert_array_equal(expected_err_mag, measured_err_mag, err_msg="The fourier_update_kernel.error_reduce "
+                                                                   "is not behaving as expected.")
+
+
+    def log_likelihood_UNITY_tester(self, use_version2=False):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        G = 2  # number of object modes
+
+        E = B  # probe size y (not used below)
+        F = C  # probe size x (not used below)
+
+        scan_pts = 2  # one dimensional scan point number
+
+        N = scan_pts ** 2
+        total_number_modes = G * D
+        A = N * total_number_modes  # 4 scan positions (2x2 grid) x 4 modes = 16 frames
+
+        f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+
+        fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE)  # the measured magnitudes, NxBxC
+        fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype)
+        fmag[:] = fmag_fill
+
+        mask = np.empty(shape=(N, B, C),
+                        dtype=FLOAT_TYPE)  # the masks for the measured magnitudes, NxBxC here
+        mask_fill = np.ones_like(mask)
+        mask_fill[::2, ::2] = 0  # zeroes the even rows of the even frames (not a true checkerboard)
+        mask[:] = mask_fill
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((N,))
+        Y = Y.reshape((N,))
+
+        addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [position_idx, 0, 0],
+                                                             [position_idx, 0, 0]])
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        mask_sum = mask.sum(-1).sum(-1)  # only used as the shape template for LLerr
+        LLerr = np.zeros_like(mask_sum, dtype=np.float32)
+        f_d = cp.asarray(f)
+        fmag_d = cp.asarray(fmag)
+        mask_d = cp.asarray(mask)
+        addr_d = cp.asarray(addr)
+        LLerr_d = cp.asarray(LLerr)
+
+        from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel
+        nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes)
+        nFUK.allocate()
+        nFUK.log_likelihood(f, addr, fmag, mask, LLerr)
+
+        FUK = FourierUpdateKernel(f, nmodes=total_number_modes)
+        FUK.allocate()
+        if use_version2:
+            FUK.log_likelihood2(f_d, addr_d, fmag_d, mask_d, LLerr_d)
+        else:
+            FUK.log_likelihood(f_d, addr_d, fmag_d, mask_d, LLerr_d)
+
+        expected_err_phot = LLerr
+        measured_err_phot = LLerr_d.get()
+
+        np.testing.assert_allclose(expected_err_phot, measured_err_phot, err_msg="Numpy log-likelihood error "
+                                                                                 "is \n%s, \nbut gpu log-likelihood error is \n%s, \n " % (
+                                                                                 repr(expected_err_phot),
+                                                                                 repr(measured_err_phot)), rtol=1e-5)
+    def test_log_likelihood_UNITY(self):
+        self.log_likelihood_UNITY_tester(False)
+
+    def test_log_likelihood2_UNITY(self):
+        self.log_likelihood_UNITY_tester(True)
+
+    def test_exit_error_UNITY(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        G = 2  # number of object modes
+
+        E = B  # probe size y (not used below)
+        F = C  # probe size x (not used below)
+
+        scan_pts = 2  # one dimensional scan point number
+
+        N = scan_pts ** 2
+        total_number_modes = G * D
+        A = N * total_number_modes  # 4 scan positions (2x2 grid) x 4 modes = 16 frames
+
+        aux = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            aux[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((N,))
+        Y = Y.reshape((N,))
+
+        addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [position_idx, 0, 0],
+                                                             [position_idx, 0, 0]])
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel
+        aux_d = cp.asarray(aux)
+        addr_d = cp.asarray(addr)
+
+        nFUK = npFourierUpdateKernel(aux, nmodes=total_number_modes)
+        FUK = FourierUpdateKernel(aux, nmodes=total_number_modes)
+
+        nFUK.allocate()
+        FUK.allocate()
+
+        nFUK.exit_error(aux, addr)
+        FUK.exit_error(aux_d, addr_d)
+
+        expected_ferr = nFUK.npy.ferr
+        measured_ferr = FUK.gpu.ferr.get()
+
+        np.testing.assert_allclose(expected_ferr, measured_ferr, err_msg="Numpy ferr "
+                                                                            "is \n%s, \nbut gpu ferr is \n %s, \n " % (
+                                                                            repr(expected_ferr),
+                                                                            repr(measured_ferr)), rtol=1e-7)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/test/accelerate_tests/cuda_cupy_tests/gpudata_test.py b/test/accelerate_tests/cuda_cupy_tests/gpudata_test.py
new file mode 100644
index 000000000..5d1be4f8d
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/gpudata_test.py
@@ -0,0 +1,265 @@
+'''Tests for GpuData and GpuDataManager from ptypy.accelerate.cuda_cupy.mem_utils.
+'''
+
+import unittest
+import numpy as np
+from . import CupyCudaTest, have_cupy
+
+if have_cupy():
+    import cupy as cp
+    import cupyx
+    from ptypy.accelerate.cuda_cupy.mem_utils import GpuData, GpuDataManager
+
+class GpuDataTest(CupyCudaTest):
+
+    def test_to_gpu_new(self):
+        # arrange: 5x5 float32 host array of 2s and a GpuData sized to its byte count
+        cpu = 2. * np.ones((5,5), dtype=np.float32)
+        gdata = GpuData(cpu.nbytes, syncback=False)
+        
+        # act: upload under id '1' on the test stream, then wait for the stream to finish
+        gpu = gdata.to_gpu(cpu, '1', self.stream)
+        self.stream.synchronize()
+
+        # assert: the returned device array reads back equal to the host array
+        np.testing.assert_array_equal(cpu, gpu.get())
+    
+    def test_to_gpu_sameid(self):
+        # arrange: 5x5 float32 host array of 2s and a GpuData sized to its byte count
+        cpu = 2. * np.ones((5,5), dtype=np.float32)
+        gdata = GpuData(cpu.nbytes, syncback=False)
+
+        # act: upload as id '1', change the host array in place, request id '1' again
+        gpu1 = gdata.to_gpu(cpu, '1', self.stream)
+        cpu *= 2.
+        gpu2 = gdata.to_gpu(cpu, '1', self.stream)
+        self.stream.synchronize()
+
+        # assert: both returned device arrays read back the same contents
+        np.testing.assert_array_equal(gpu1.get(), gpu2.get())
+        
+    def test_to_gpu_new_syncback(self):
+        # arrange: 5x5 float32 host array of 2s and a GpuData with syncback enabled
+        cpu = 2. * np.ones((5,5), dtype=np.float32)
+        gdata = GpuData(cpu.nbytes, syncback=True)
+
+        # act: upload id '1', overwrite it with 3s on the device, then upload new id '2'
+        gpu1 = gdata.to_gpu(cpu, '1', self.stream)
+        with self.stream:
+            gpu1.fill(np.float32(3.))
+        cpu2 = 2. * cpu
+        gpu2 = gdata.to_gpu(cpu2, '2', self.stream)
+        self.stream.synchronize()
+
+        # assert: host array of id '1' now holds the device-side 3s; device holds cpu2
+        np.testing.assert_array_equal(cpu, 3.)
+        np.testing.assert_array_equal(gpu2.get(), cpu2)
+
+    def test_to_gpu_new_nosyncback(self):
+        # arrange: 5x5 float32 host array of 2s and a GpuData with syncback disabled
+        cpu = 2. * np.ones((5,5), dtype=np.float32)
+        gdata = GpuData(cpu.nbytes, syncback=False)
+
+        # act: upload id '1', overwrite it with 3s on the device, then upload new id '2'
+        gpu1 = gdata.to_gpu(cpu, '1', self.stream)
+        with self.stream:
+            gpu1.fill(np.float32(3.))
+        cpu2 = 2. * cpu
+        gpu2 = gdata.to_gpu(cpu2, '2', self.stream)
+        self.stream.synchronize()
+
+        # assert: host array of id '1' is untouched (still 2s); device holds cpu2
+        np.testing.assert_array_equal(cpu, 2.)
+        np.testing.assert_array_equal(gpu2.get(), cpu2)
+
+    def test_from_gpu(self):
+        # arrange: 5x5 float32 host array of 2s and a GpuData with syncback disabled
+        cpu = 2. * np.ones((5,5), dtype=np.float32)
+        gdata = GpuData(cpu.nbytes, syncback=False)
+
+        # act: upload, overwrite on device, call from_gpu; no assertion follows (smoke test only)
+        gpu1 = gdata.to_gpu(cpu, '1', self.stream)
+        with self.stream:
+            gpu1.fill(np.float32(3.))
+        gdata.from_gpu(self.stream)
+        self.stream.synchronize()
+
+    def test_data_variable_size(self):
+        # arrange: buffer sized for a (2,5) array of 1s; second array is (1,5) of 2s
+        cpu = np.ones((2,5), dtype=np.float32)
+        cpu2 = 2. * np.ones((1,5), dtype=np.float32)
+        gdata = GpuData(cpu.nbytes, syncback=False)
+
+        # act: upload the larger array as id '1', then the smaller one as id '2'
+        gpu = gdata.to_gpu(cpu, '1', self.stream)
+        gpu2 = gdata.to_gpu(cpu2, '2', self.stream)
+        self.stream.synchronize()
+
+        # assert: gpu2 matches cpu2 in content and byte size; the earlier `gpu` array now
+        # reads 2s in its first row and the original 1s in its second row
+        np.testing.assert_array_equal(gpu2.get(), cpu2)
+        self.assertEqual(cpu2.nbytes, gpu2.nbytes)
+        np.testing.assert_array_equal(gpu.get(), np.array([
+            [2, 2, 2, 2, 2],
+            [1, 1, 1, 1, 1]
+        ], dtype=np.float32))
+
+    def test_data_variable_size_raise(self):
+        # arrange: buffer sized for (1,5) float32 (20 bytes); cpu2 is (2,4) float32 (32 bytes)
+        cpu = np.ones((1,5), dtype=np.float32)
+        cpu2 = np.ones((2,4), dtype=np.float32)
+        gdata = GpuData(cpu.nbytes, syncback=False)
+
+        # act/assert: uploading the array that exceeds the buffer size must raise
+        with self.assertRaises(Exception):
+            gdata.to_gpu(cpu2, '1', self.stream)
+
+    def test_data_resize_raise(self):
+        # arrange: upload a (5,5) array, then prepare a larger (10,5) array
+        cpu = np.ones((5,5), dtype=np.float32)
+        gdata = GpuData(cpu.nbytes, syncback=False)
+        gpu = gdata.to_gpu(cpu, '1', self.stream)
+        cpu2 = np.ones((10,5), dtype=np.float32)
+
+        # act: grow the buffer and upload again (NOTE(review): despite the name, nothing is expected to raise)
+        gdata.resize(cpu2.nbytes)
+        gpu2 = gdata.to_gpu(cpu2, '1', self.stream)
+
+        # assert: id is kept, nbytes reflects the new size, backing buffer is at least that large
+        self.assertEqual(gdata.gpuId, '1')
+        self.assertEqual(gdata.nbytes, cpu2.nbytes)
+        self.assertEqual(gpu2.size, cpu2.size)
+        self.assertGreaterEqual(gdata.nbytes_buffer, cpu2.nbytes)
+
+    def test_data_resize_shrink(self):
+        # arrange: upload a (5,5) array (25 elements), then prepare a smaller (4,6) array (24 elements)
+        cpu = np.ones((5,5), dtype=np.float32)
+        gdata = GpuData(cpu.nbytes, syncback=False)
+        gpu = gdata.to_gpu(cpu, '1', self.stream)
+        cpu2 = np.ones((4,6), dtype=np.float32)
+
+        # act: resize to the smaller byte count and upload the smaller array under the same id
+        gdata.resize(cpu2.nbytes)
+        gpu2 = gdata.to_gpu(cpu2, '1', self.stream)
+
+        # assert: id is kept, nbytes reflects the new size, backing buffer is at least that large
+        self.assertEqual(gdata.gpuId, '1')
+        self.assertEqual(gdata.nbytes, cpu2.nbytes)
+        self.assertEqual(gpu2.size, cpu2.size)
+        self.assertGreaterEqual(gdata.nbytes_buffer, cpu2.nbytes)
+
+    def test_datamanager_memory(self):
+        # arrange / act: create with (128, 4), then reset to (124, 3) - presumably (nbytes, num blocks); confirm
+        gdm = GpuDataManager(128, 4)
+        gdm.reset(124, 3)
+
+        # assert: reported memory is 3*128 while nbytes is the new, smaller 124
+        self.assertEqual(gdm.memory, 3*128)
+        self.assertEqual(gdm.nbytes, 124)
+
+    def test_datamanager_free(self):
+        # arrange: manager created with (128, 2)
+        gdm = GpuDataManager(128, 2)
+
+        # act: release everything
+        gdm.free()
+
+        # assert: no memory is reported after free()
+        self.assertEqual(gdm.memory, 0)
+
+    def test_datamanager_newids(self):
+        # arrange: four 5x5 host arrays (2, 4, 8, 16) and a manager created with 4 and syncback off
+        cpu1 = 2. * np.ones((5,5), dtype=np.float32)
+        cpu2 = 2. * cpu1  # 4
+        cpu3 = 2. * cpu2  # 8
+        cpu4 = 2. * cpu3  # 16
+        gdm = GpuDataManager(cpu1.nbytes, 4, syncback=False)
+
+        # act: upload each id once, then request each id again with different (negated) host data
+        gpu1 = gdm.to_gpu(cpu1, '1', self.stream)[1]
+        gpu2 = gdm.to_gpu(cpu2, '2', self.stream)[1]
+        gpu11 = gdm.to_gpu(-1.*cpu1, '1', self.stream)[1]
+        gpu21 = gdm.to_gpu(-1.*cpu4, '2', self.stream)[1]
+        gpu3 = gdm.to_gpu(cpu3, '3', self.stream)[1]
+        gpu31 = gdm.to_gpu(-1.*cpu1, '3', self.stream)[1]
+        gpu4 = gdm.to_gpu(cpu4, '4', self.stream)[1]
+        gpu41 = gdm.to_gpu(-1.*cpu1, '4', self.stream)[1]
+        self.stream.synchronize()
+
+        # assert: every id still reads back its first upload and the host arrays are unchanged
+        np.testing.assert_array_equal(cpu1, gpu1.get())
+        np.testing.assert_array_equal(cpu1, gpu11.get())
+        np.testing.assert_array_equal(cpu1, 2.)
+        np.testing.assert_array_equal(cpu2, gpu2.get())
+        np.testing.assert_array_equal(cpu2, gpu21.get())
+        np.testing.assert_array_equal(cpu2, 4.)
+        np.testing.assert_array_equal(cpu3, gpu3.get())
+        np.testing.assert_array_equal(cpu3, gpu31.get())
+        np.testing.assert_array_equal(cpu3, 8.)
+        np.testing.assert_array_equal(cpu4, gpu4.get())
+        np.testing.assert_array_equal(cpu4, gpu41.get())
+        np.testing.assert_array_equal(cpu4, 16.)
+
+    def test_datamanager_syncback(self):
+        # arrange: four 5x5 host arrays (2, 4, 8, 16) and a manager created with 2 and syncback on
+        cpu1 = 2. * np.ones((5,5), dtype=np.float32)
+        cpu2 = 2. * cpu1  # 4
+        cpu3 = 2. * cpu2  # 8
+        cpu4 = 2. * cpu3  # 16
+        gdm = GpuDataManager(cpu1.nbytes, 2, syncback=True)
+
+        # act: upload ids '1'..'4', overwriting each on the device; then turn syncback off
+        # and upload two further ids '5' and '6'
+        gpu1 = gdm.to_gpu(cpu1, '1', self.stream)[1]
+        gpu2 = gdm.to_gpu(cpu2, '2', self.stream)[1]
+        with self.stream:
+            gpu1.fill(np.float32(3.))
+            gpu2.fill(np.float32(5.))
+        gpu3 = gdm.to_gpu(cpu3, '3', self.stream)[1]
+        with self.stream:
+            gpu3.fill(np.float32(7.))
+        gpu4 = gdm.to_gpu(cpu4, '4', self.stream)[1]
+        with self.stream:
+            gpu4.fill(np.float32(9.))
+        gdm.syncback = False
+        gpu5 = gdm.to_gpu(cpu4*.2, '5', self.stream)[1]
+        gpu6 = gdm.to_gpu(cpu4*.4, '6', self.stream)[1]
+        self.stream.synchronize()
+
+        # assert: cpu1/cpu2 received the device-side fills (3, 5); cpu3/cpu4 keep their host values (8, 16)
+        np.testing.assert_array_equal(cpu1, 3.)
+        np.testing.assert_array_equal(cpu2, 5.)
+        np.testing.assert_array_equal(cpu3, 8.)
+        np.testing.assert_array_equal(cpu4, 16.)
+
+    def test_data_synctransfer(self):
+        # arrange: two pinned host arrays (1s and 2s) and one GpuData with syncback enabled
+        sh = (1024, 1024, 1)  # 4MB
+        cpu1 = cupyx.zeros_pinned(sh, np.float32, order="C")
+        cpu2 = cupyx.zeros_pinned(sh, np.float32, order="C")
+        cpu1[:] = 1.
+        cpu2[:] = 2.
+        gdata = GpuData(cpu1.nbytes, syncback=True)
+        # long-running kernel: launched below with one thread that writes all sz elements serially
+        knl = """
+        extern "C" __global__ void tfill(float* d, int sz, float dval) {
+            for (int i = 0; i < sz; ++i) 
+                d[i] = dval;
+        }
+        """
+        tfill = cp.RawKernel(knl, "tfill")
+        
+        # act: fill the data of id '1' with 2s, swap in id '2' on a second stream, fill that with 4s
+        s2 = cp.cuda.Stream()
+        gpu1 = gdata.to_gpu(cpu1, '1', self.stream)
+        with s2:  # NOTE(review): record_done() below is given self.stream but this launch uses s2 - confirm
+            tfill(grid=(1,1,1), block=(1,1,1), args=(gpu1, np.int32(gpu1.size), np.float32(2.)))
+        gdata.record_done(self.stream)  # it will fail without this
+        gpu2 = gdata.to_gpu(cpu2, '2', s2)
+        with s2:  # second fill writes through gpu2, the array returned for id '2'
+            tfill(grid=(1,1,1), block=(1,1,1), args=(gpu2, np.int32(gpu2.size), np.float32(4.)))
+        gdata.from_gpu(s2)
+        self.stream.synchronize()
+        s2.synchronize()
+        
+        # assert: each host array received the value written on the device for its id
+        np.testing.assert_array_equal(cpu1, 2.)
+        np.testing.assert_array_equal(cpu2, 4.)
diff --git a/test/accelerate_tests/cuda_cupy_tests/gradient_descent_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/gradient_descent_kernel_test.py
new file mode 100644
index 000000000..b3a6e8ff7
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/gradient_descent_kernel_test.py
@@ -0,0 +1,327 @@
+'''
+Tests for GradientDescentKernel from ptypy.accelerate.cuda_cupy.kernels.
+
+'''
+
+import unittest
+import numpy as np
+from . import perfrun, CupyCudaTest, have_cupy
+
+
+if have_cupy():
+    import cupy as cp
+    from ptypy.accelerate.cuda_cupy.kernels import GradientDescentKernel
+
+
+COMPLEX_TYPE = np.complex64
+FLOAT_TYPE = np.float32
+INT_TYPE = np.int32
+
+
+class GradientDescentKernelTest(CupyCudaTest):
+
+    def prepare_arrays(self, performance=False):  # build the device-side fixture used by the tests below
+        if not performance:
+            nmodes = 2
+            N_buf = 4 
+            N = 3 
+            A = 3 
+        else:
+            nmodes = 4
+            N_buf = 100
+            N = 80
+            A =  512
+        i_sh = (N, A, A)  # shape of the intensity and weight arrays
+        e_sh = (N*nmodes, A, A)  # not used below
+        f_sh = (N_buf, A, A)  # not used below
+        a_sh = (N_buf * nmodes, A, A)  # shape of the three complex buffers b_f, b_a, b_b
+        w = np.ones(i_sh, dtype=FLOAT_TYPE)
+        for idx, sl in enumerate(w):
+            sl[idx % A, idx % A] = 0.0  # zero one diagonal weight in every frame
+        X, Y, Z = np.indices(a_sh, dtype=COMPLEX_TYPE)  # index grids along the three axes
+        b_f = X + 1j * Y
+        b_a = Y + 1j * Z
+        b_b = Z + 1j * X
+        err_sum = np.zeros((N,), dtype=FLOAT_TYPE)
+        fic = np.ones((N,), dtype=FLOAT_TYPE)
+        addr = np.zeros((N, nmodes, 5, 3), dtype=INT_TYPE)
+        I = np.empty(i_sh, dtype=FLOAT_TYPE)
+        I[:] = np.round(np.abs(b_f[:N])**2 % 20)  # intensities derived from the first N frames of b_f
+        for pos_idx in range(N):
+            for mode_idx in range(nmodes):
+                exit_idx = pos_idx * nmodes + mode_idx  # consecutive buffer frames per position
+                addr[pos_idx, mode_idx] = np.array([[mode_idx, 0, 0],
+                                                    [0, 0, 0],
+                                                    [exit_idx, 0, 0],
+                                                    [pos_idx, 0, 0],
+                                                    [pos_idx, 0, 0]], dtype=INT_TYPE)
+        return (cp.asarray(b_f),  # all outputs are copied to the device, in this order
+                cp.asarray(b_a),
+                cp.asarray(b_b),
+                cp.asarray(I),
+                cp.asarray(w),
+                cp.asarray(err_sum),
+                cp.asarray(addr),
+                cp.asarray(fic))
+
+    def test_allocate(self):  # smoke test: construction and allocate() must not raise
+        b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays()
+        GDK = GradientDescentKernel(b_f, addr.shape[1])  # addr.shape[1] is the number of modes
+        GDK.allocate()
+
+    def test_make_model(self):  # make_model() must fill gpu.Imodel with the hard-coded values below
+        b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays()
+
+        GDK = GradientDescentKernel(b_f, addr.shape[1])
+        GDK.allocate()
+        GDK.make_model(b_f, addr)
+
+        exp_Imodel = np.array([[[1.,  1.,  1.],
+                                [3.,  3.,  3.],
+                                [9.,  9.,  9.]],
+
+                               [[13., 13., 13.],
+                                [15., 15., 15.],
+                                [21., 21., 21.]],
+
+                               [[41., 41., 41.],
+                                [43., 43., 43.],
+                                [49., 49., 49.]],
+
+                               [[85., 85., 85.],
+                                [87., 87., 87.],
+                                [93., 93., 93.]]], dtype=FLOAT_TYPE)
+
+        np.testing.assert_array_almost_equal(
+            exp_Imodel, GDK.gpu.Imodel.get(),
+            err_msg="`Imodel` buffer has not been updated as expected")
+
+    @unittest.skipIf(not perfrun, "performance test")
+    def test_make_model_performance(self):  # runs make_model() on the large fixture; no assertions
+        b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True)
+
+        GDK = GradientDescentKernel(b_f, addr.shape[1])
+        GDK.allocate()
+        GDK.make_model(b_f, addr)
+
+    def test_floating_intensity(self):  # floating_intensity() must update both Imodel and fic
+        b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays()
+        GDK=GradientDescentKernel(b_f, addr.shape[1])
+        GDK.allocate()
+        GDK.gpu.Imodel[0] = I[0] * 3.  # seed the model with I scaled by 3, 2 and 1 per frame
+        GDK.gpu.Imodel[1] = I[1] * 2.
+        GDK.gpu.Imodel[2] = I[2]
+        GDK.floating_intensity(addr, w, I, fic)
+        #print('Imodel',repr(GDK.gpu.Imodel))
+        #print('fic',repr(1./fic))
+        exp_Imodel = np.array([[[0., 0., 0.],
+                [1., 1., 1.],
+                [4., 4., 4.]],
+
+               [[1., 1., 1.],
+                [2., 2., 2.],
+                [5., 5., 5.]],
+
+               [[4., 4., 4.],
+                [5., 5., 5.],
+                [8., 8., 8.]],
+
+               [[0., 0., 0.],
+                [0., 0., 0.],
+                [0., 0., 0.]]], dtype=np.float32)
+        exp_fic=1./np.array([3., 2., 1.], dtype=np.float32)  # inverse of the scale factors seeded above
+        np.testing.assert_array_almost_equal(exp_Imodel, GDK.gpu.Imodel.get(),
+            err_msg="`Imodel` buffer has not been updated as expected")
+        np.testing.assert_array_almost_equal(exp_fic, fic.get(),
+            err_msg="floating intensity coeff (fic) has not been updated as expected")
+
+    def test_make_a012(self):  # make_a012() must leave A0/A1/A2 in gpu.Imodel/LLerr/LLden
+        b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays()
+        GDK = GradientDescentKernel(b_f, addr.shape[1])
+        GDK.allocate()
+        GDK.make_a012(b_f, b_a, b_b, addr, I, fic)
+
+        exp_A0 = np.array([[[1.,  1.,  1.],
+                            [2.,  2.,  2.],
+                            [5.,  5.,  5.]],
+
+                           [[12., 12., 12.],
+                            [13., 13., 13.],
+                            [16., 16., 16.]],
+
+                           [[37., 37., 37.],
+                            [38., 38., 38.],
+                            [41., 41., 41.]],
+
+                           [[0.,  0.,  0.],
+                            [0.,  0.,  0.],
+                            [0.,  0.,  0.]]], dtype=FLOAT_TYPE)
+        np.testing.assert_array_almost_equal(
+            exp_A0, GDK.gpu.Imodel.get(),
+            err_msg="`Imodel` buffer (=A0) has not been updated as expected")
+
+        exp_A1 = np.array([[[0., 0., 0.],
+                            [2., 6., 10.],
+                            [4., 12., 20.]],
+
+                           [[0., 0., 0.],
+                            [10., 14., 18.],
+                            [20., 28., 36.]],
+
+                           [[0., 0., 0.],
+                            [18., 22., 26.],
+                            [36., 44., 52.]],
+
+                           [[0., 0., 0.],
+                            [0., 0., 0.],
+                            [0., 0., 0.]]], dtype=FLOAT_TYPE)
+        np.testing.assert_array_almost_equal(
+            exp_A1, GDK.gpu.LLerr.get(),
+            err_msg="`LLerr` buffer (=A1) has not been updated as expected")
+
+        exp_A2 = np.array([[[0., 4., 12.],
+                            [4., 8., 16.],
+                            [12., 16., 24.]],
+
+                           [[0., 12., 28.],
+                            [12., 24., 40.],
+                            [28., 40., 56.]],
+
+                           [[0., 20., 44.],
+                            [20., 40., 64.],
+                            [44., 64., 88.]],
+
+                           [[0., 0., 0.],
+                            [0., 0., 0.],
+                            [0., 0., 0.]]], dtype=FLOAT_TYPE)
+        np.testing.assert_array_almost_equal(
+            exp_A2, GDK.gpu.LLden.get(),
+            err_msg="`LLden` buffer (=A2) has not been updated as expected")
+
+    @unittest.skipIf(not perfrun, "performance test")
+    def test_make_a012_performance(self):  # runs make_a012() on the large fixture; no assertions
+        b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True)
+
+        GDK = GradientDescentKernel(b_f, addr.shape[1])
+        GDK.allocate()
+        GDK.make_a012(b_f, b_a, b_b, addr, I, fic)
+
+    def test_fill_b(self):  # fill_b() after make_a012() must write the three expected values into B
+        b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays()
+        Brenorm = 0.35
+        B = np.zeros((3,), dtype=FLOAT_TYPE)
+        B_dev = cp.asarray(B)  # device-side output that fill_b() writes into
+        GDK = GradientDescentKernel(b_f, addr.shape[1])
+        GDK.allocate()
+        GDK.make_a012(b_f, b_a, b_b, addr, I, fic)
+        GDK.fill_b(addr, Brenorm, w, B_dev)
+        B[:] = B_dev.get()  # copy the result back to the host for comparison
+
+        exp_B = np.array([ 4699.8,  5398.4, 13398.], dtype=FLOAT_TYPE)
+        np.testing.assert_allclose(
+            B, exp_B,
+            rtol=1e-7,
+            err_msg="`B` has not been updated as expected")
+
+    @unittest.skipIf(not perfrun, "performance test")
+    def test_fill_b_perf(self):  # runs make_a012() + fill_b() on the large fixture; no assertions
+        b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True)
+        Brenorm = 0.35
+        B = np.zeros((3,), dtype=FLOAT_TYPE)
+        B_dev = cp.asarray(B)
+        GDK = GradientDescentKernel(b_f, addr.shape[1])
+        GDK.allocate()
+        GDK.make_a012(b_f, b_a, b_b, addr, I, fic)
+        GDK.fill_b(addr, Brenorm, w, B_dev)
+    
+    def test_error_reduce(self):  # error_reduce() must write the expected per-frame values into err_sum
+        b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays()
+        GDK = GradientDescentKernel(b_f, addr.shape[1])
+        GDK.allocate()
+        GDK.npy.LLerr = np.indices(GDK.gpu.LLerr.shape, dtype=FLOAT_TYPE)[0]  # every element holds its first-axis index
+        GDK.gpu.LLerr = cp.asarray(GDK.npy.LLerr)
+        GDK.error_reduce(addr, err_sum)
+
+        exp_err = np.array([0.,  9., 18.], dtype=FLOAT_TYPE)  # presumably 9 elements per 3x3 frame times the index - confirm
+        np.testing.assert_array_almost_equal(
+            exp_err, err_sum.get(),
+            err_msg="`err_sum` has not been updated as expected")
+    
+    @unittest.skipIf(not perfrun, "performance test")
+    def test_error_reduce_perf(self):  # runs error_reduce() on the large fixture; no assertions
+        b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True)
+        GDK = GradientDescentKernel(b_f, addr.shape[1])
+        GDK.allocate()
+        GDK.npy.LLerr = np.indices(GDK.gpu.LLerr.shape, dtype=FLOAT_TYPE)[0]
+        GDK.gpu.LLerr = cp.asarray(GDK.npy.LLerr)
+        GDK.error_reduce(addr, err_sum)
+
+    def test_main(self):  # main() must update b_f in place and fill gpu.LLerr as hard-coded below
+        b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays()
+        GDK = GradientDescentKernel(b_f, addr.shape[1])
+        GDK.allocate()
+        GDK.main(b_f, addr, w, I)
+
+        exp_b_f = np.array([[[0. + 0.j,   0. + 0.j,   0. + 0.j],
+                             [-0. - 1.j,  -0. - 1.j,  -0. - 1.j],
+                             [-0. - 8.j,  -0. - 8.j,  -0. - 8.j]],
+
+                            [[0. + 0.j,   0. + 0.j,   0. + 0.j],
+                             [-1. - 1.j,  -1. - 1.j,  -1. - 1.j],
+                             [-4. - 8.j,  -4. - 8.j,  -4. - 8.j]],
+
+                            [[-2. + 0.j,  -2. + 0.j,  -2. + 0.j],
+                             [-4. - 2.j,  -0. + 0.j,  -4. - 2.j],
+                             [-10.-10.j, -10.-10.j, -10.-10.j]],
+
+                            [[-3. + 0.j,  -3. + 0.j,  -3. + 0.j],
+                             [-6. - 2.j,  -0. + 0.j,  -6. - 2.j],
+                             [-15.-10.j, -15.-10.j, -15.-10.j]],
+
+                            [[-16. + 0.j, -16. + 0.j, -16. + 0.j],
+                             [-20. - 5.j, -20. - 5.j, -20. - 5.j],
+                             [-32.-16.j, -32.-16.j,  -0. + 0.j]],
+
+                            [[-20. + 0.j, -20. + 0.j, -20. + 0.j],
+                             [-25. - 5.j, -25. - 5.j, -25. - 5.j],
+                             [-40.-16.j, -40.-16.j,  -0. + 0.j]],
+
+                            [[6. + 0.j,   6. + 0.j,   6. + 0.j],
+                             [6. + 1.j,   6. + 1.j,   6. + 1.j],
+                             [6. + 2.j,   6. + 2.j,   6. + 2.j]],
+
+                            [[7. + 0.j,   7. + 0.j,   7. + 0.j],
+                             [7. + 1.j,   7. + 1.j,   7. + 1.j],
+                             [7. + 2.j,   7. + 2.j,   7. + 2.j]]], dtype=COMPLEX_TYPE)
+        np.testing.assert_array_almost_equal(
+            exp_b_f, b_f.get(),
+            err_msg="Auxiliary has not been updated as expected")
+
+        exp_LL = np.array([[[0.,  0.,  0.],
+                            [1.,  1.,  1.],
+                            [16., 16., 16.]],
+
+                           [[1.,  1.,  1.],
+                            [4.,  0.,  4.],
+                            [25., 25., 25.]],
+
+                           [[16., 16., 16.],
+                            [25., 25., 25.],
+                            [64., 64.,  0.]],
+
+                           [[0.,  0.,  0.],
+                            [0.,  0.,  0.],
+                            [0.,  0.,  0.]]], dtype=FLOAT_TYPE)
+        np.testing.assert_array_almost_equal(
+            exp_LL, GDK.gpu.LLerr.get(),
+            err_msg="LogLikelihood error has not been updated as expected")
+
+    @unittest.skipIf(not perfrun, "performance test")
+    def test_main_perf(self):  # runs main() on the large fixture; no assertions
+        b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True)
+        GDK = GradientDescentKernel(b_f, addr.shape[1])
+        GDK.allocate()
+        GDK.main(b_f, addr, w, I)
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file
diff --git a/test/accelerate_tests/cuda_cupy_tests/import_test.py b/test/accelerate_tests/cuda_cupy_tests/import_test.py
new file mode 100644
index 000000000..3af6dff3f
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/import_test.py
@@ -0,0 +1,10 @@
+"""
+Import test
+"""
+import unittest
+
+class AutoLoaderTest(unittest.TestCase):  # checks that the cupy engines can be loaded by name
+        
+    def test_load_engines_cupy(self):  # passes if the call below does not raise
+        import ptypy  # imported inside the test so an import failure is reported by this test
+        ptypy.load_gpu_engines("cupy")
diff --git a/test/accelerate_tests/cuda_cupy_tests/multi_gpu_test.py b/test/accelerate_tests/cuda_cupy_tests/multi_gpu_test.py
new file mode 100644
index 000000000..0c234d878
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/multi_gpu_test.py
@@ -0,0 +1,74 @@
+'''Tests for the multi-GPU communicators in ptypy.accelerate.cuda_cupy.multi_gpu.
+'''
+
+import unittest
+from mpi4py.MPI import Get_version
+import numpy as np
+from . import CupyCudaTest, have_cupy
+
+if have_cupy():
+    import cupy as cp
+    from ptypy.accelerate.cuda_cupy import multi_gpu as mgpu
+    from ptypy.utils import parallel
+
+from pkg_resources import parse_version
+
+class GpuDataTest(CupyCudaTest):
+    """
+    This is a test class for MPI - to really check if it all works, it needs
+    to be run as:
+
+    mpirun -np 2 pytest multi_gpu_test.py
+
+    For CUDA-aware MPI testing, currently the environment variable
+
+    OMPI_MCA_opal_cuda_support=true
+
+    needs to be set, mpi4py version 3.1.0+ used, and a cuda-aware MPI version.
+    """
+
+    def setUp(self):  # overrides the base setUp without calling it; selects one device per local rank
+        if parallel.rank_local <  cp.cuda.runtime.getDeviceCount():
+            self.device = cp.cuda.Device(parallel.rank_local)
+            self.device.use()
+        else:
+            self.device = None  # more local ranks than devices: multigpu_tester() returns early
+
+    @unittest.skipIf(parallel.rank != 0, "Only in MPI rank 0")  # NOTE(review): `parallel` is only imported when have_cupy() is true
+    def test_version(self):  # compares two constants only; the installed mpi4py version is not read here
+        v1 = parse_version("3.1.0")
+        v2 = parse_version(parse_version("3.1.0a").base_version)  # base_version of the pre-release string
+
+        self.assertGreaterEqual(v2, v1)
+
+    def test_compute_mode(self):  # the device attributes must expose ComputeMode with a value in 0..3
+        attr = cp.cuda.Device().attributes
+        self.assertIn("ComputeMode", attr)
+        mode = attr["ComputeMode"]
+        self.assertIn(mode, [0, 1, 2, 3])
+
+    def multigpu_tester(self, com):  # shared check: allReduceSum of ones must give parallel.size everywhere
+        if self.device is None:
+            return
+
+        data = np.ones((2, 1), dtype=np.float32)
+        data_dev = cp.asarray(data)
+        sz = parallel.size
+        com.allReduceSum(data_dev)  # result is read back from data_dev afterwards
+
+        out = data_dev.get()
+        np.testing.assert_allclose(out, sz * data, rtol=1e-6)
+
+    def test_multigpu_auto(self):  # communicator chosen by get_multi_gpu_communicator()
+        self.multigpu_tester(mgpu.get_multi_gpu_communicator())
+        
+    def test_multigpu_mpi(self):  # MultiGpuCommunicatorMpi, run without a skip condition
+        self.multigpu_tester(mgpu.MultiGpuCommunicatorMpi())
+
+    @unittest.skipIf(not mgpu.have_cuda_mpi, "Cuda-aware MPI not available")  # NOTE(review): `mgpu` is only imported when have_cupy() is true
+    def test_multigpu_cudampi(self):
+        self.multigpu_tester(mgpu.MultiGpuCommunicatorCudaMpi())
+
+    @unittest.skipIf(not mgpu.have_nccl, "NCCL not available")  # NOTE(review): same import-time dependency on `mgpu`
+    def test_multigpu_nccl(self):
+        self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl())
\ No newline at end of file
diff --git a/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py
new file mode 100644
index 000000000..8a41bad35
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py
@@ -0,0 +1,943 @@
+'''
+Tests for PoUpdateKernel from ptypy.accelerate.cuda_cupy.kernels.
+
+'''
+
+import unittest
+import numpy as np
+from . import CupyCudaTest, have_cupy
+from ptypy.accelerate.base.array_utils import max_abs2
+
+if have_cupy():
+    import cupy as cp
+    from ptypy.accelerate.cuda_cupy.kernels import PoUpdateKernel
+
+COMPLEX_TYPE = np.complex64
+FLOAT_TYPE = np.float32
+INT_TYPE = np.int32
+
+
+class PoUpdateKernelTest(CupyCudaTest):
+
+    def prepare_arrays(self, scan_points=None):  # build the device-side fixture; scan_points is the 1D scan size
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  # object size y
+        I = C + npts_greater_than  # object size x
+
+        if scan_points is None:
+            scan_pts = 2  # one dimensional scan point number
+        else:
+            scan_pts = scan_points
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes  # number of exit waves: one per (scan position, mode) pair; 16 for the default 2x2 scan
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):  # one group of address entries per scan position
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE)
+        for idx in range(G):
+            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) 
+
+        probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE)
+        for idx in range(D):
+            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) 
+
+        return (cp.asarray(addr), 
+            cp.asarray(object_array), 
+            cp.asarray(object_array_denominator), 
+            cp.asarray(probe), 
+            cp.asarray(exit_wave), 
+            cp.asarray(probe_denominator))
+
+
+    def test_init(self):  # a default-constructed kernel must list exactly these two kernel names, in order
+        POUK = PoUpdateKernel()
+        np.testing.assert_equal(POUK.kernels, ['pr_update', 'ob_update'],
+                                err_msg='PoUpdateKernel does not have the correct functions registered.')
+
+    def ob_update_REGRESSION_tester(self, atomics=True):
+        
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  #  object size y
+        I = C + npts_greater_than  #  object size x
+
+        scan_pts = 2  # one dimensional scan point number
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes
+
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):#
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE)
+        for idx in range(G):
+            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2)
+
+
+        POUK = PoUpdateKernel()
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+        # print("object array denom before:")
+        # print(object_array_denominator)
+        object_array_dev = cp.asarray(object_array)
+        object_array_denominator_dev = cp.asarray(object_array_denominator)
+        probe_dev = cp.asarray(probe)
+        exit_wave_dev = cp.asarray(exit_wave)
+        if not atomics:
+            addr2 = np.ascontiguousarray(np.transpose(addr, (2, 3, 0, 1)))
+            addr_dev = cp.asarray(addr2)
+        else:
+            addr_dev = cp.asarray(addr)
+
+        print(object_array_denominator)
+        POUK.ob_update(addr_dev, object_array_dev, object_array_denominator_dev, probe_dev, exit_wave_dev, atomics=atomics)
+        print("\n\n cuda  version")
+        print(object_array_denominator_dev.get())
+        nPOUK.ob_update(addr, object_array, object_array_denominator, probe, exit_wave)
+        print("\n\n numpy version")
+        print(object_array_denominator)
+
+
+
+        expected_object_array = np.array([[[15.+1.j, 53.+1.j, 53.+1.j, 53.+1.j, 53.+1.j, 39.+1.j, 1.+1.j],
+                                           [77.+1.j, 201.+1.j, 201.+1.j, 201.+1.j, 201.+1.j, 125.+1.j, 1.+1.j],
+                                           [77.+1.j, 201.+1.j, 201.+1.j, 201.+1.j, 201.+1.j, 125.+1.j, 1.+1.j],
+                                           [77.+1.j, 201.+1.j, 201.+1.j, 201.+1.j, 201.+1.j, 125.+1.j, 1.+1.j],
+                                           [77.+1.j, 201.+1.j, 201.+1.j, 201.+1.j, 201.+1.j, 125.+1.j, 1.+1.j],
+                                           [63.+1.j, 149.+1.j, 149.+1.j, 149.+1.j, 149.+1.j, 87.+1.j, 1.+1.j],
+                                           [1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j]],
+                                          [[24. + 4.j, 68. + 4.j, 68. + 4.j, 68. + 4.j, 68. + 4.j, 48. + 4.j, 4. + 4.j],
+                                           [92. + 4.j, 228. + 4.j, 228. + 4.j, 228. + 4.j, 228. + 4.j, 140. + 4.j, 4. + 4.j],
+                                           [92. + 4.j, 228. + 4.j, 228. + 4.j, 228. + 4.j, 228. + 4.j, 140. + 4.j, 4. + 4.j],
+                                           [92. + 4.j, 228. + 4.j, 228. + 4.j, 228. + 4.j, 228. + 4.j, 140. + 4.j, 4. + 4.j],
+                                           [92. + 4.j, 228. + 4.j, 228. + 4.j, 228. + 4.j, 228. + 4.j, 140. + 4.j, 4. + 4.j],
+                                           [72. + 4.j, 164. + 4.j, 164. + 4.j, 164. + 4.j, 164. + 4.j,  96. + 4.j, 4. + 4.j],
+                                           [4. + 4.j,  4. + 4.j,   4. + 4.j,   4. + 4.j,   4. + 4.j,   4. + 4.j,   4. + 4.j]]],
+                                         dtype=COMPLEX_TYPE)
+
+
+        np.testing.assert_array_equal(object_array, expected_object_array,
+                                      err_msg="The object array has not been updated as expected")
+
+        expected_object_array_denominator = np.array([[[12., 22., 22., 22., 22., 12.,  2.],
+                                                       [22., 42., 42., 42., 42., 22.,  2.],
+                                                       [22., 42., 42., 42., 42., 22.,  2.],
+                                                       [22., 42., 42., 42., 42., 22.,  2.],
+                                                       [22., 42., 42., 42., 42., 22.,  2.],
+                                                       [12., 22., 22., 22., 22., 12.,  2.],
+                                                       [ 2.,  2.,  2.,  2.,  2.,  2.,  2.]],
+
+                                                      [[17., 27., 27., 27., 27., 17.,  7.],
+                                                       [27., 47., 47., 47., 47., 27.,  7.],
+                                                       [27., 47., 47., 47., 47., 27.,  7.],
+                                                       [27., 47., 47., 47., 47., 27.,  7.],
+                                                       [27., 47., 47., 47., 47., 27.,  7.],
+                                                       [17., 27., 27., 27., 27., 17.,  7.],
+                                                       [ 7.,  7.,  7.,  7.,  7.,  7.,  7.]]],
+                                                     dtype=FLOAT_TYPE)
+
+
+        np.testing.assert_array_equal(object_array_denominator_dev.get(), expected_object_array_denominator,
+                                      err_msg="The object array denominatorhas not been updated as expected")
+
+
+    def test_ob_update_atomics_REGRESSION(self):
+        self.ob_update_REGRESSION_tester(atomics=True)  # regression check with atomics=True
+
+    def test_ob_update_tiled_REGRESSION(self):
+        self.ob_update_REGRESSION_tester(atomics=False)  # atomics=False ("tiled" per the test name)
+
+    def ob_update_UNITY_tester(self, atomics=True):
+        '''
+        Run ob_update on CuPy copies and on the numpy originals and compare results. Setup:
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  #  object size y
+        I = C + npts_greater_than  #  object size x
+
+        scan_pts = 2  # one dimensional scan point number
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes  # 16 exit waves: 4 scan positions (2x2 grid) x 4 mode combinations
+
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):  # one iteration per scan position
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],  # probe mode index
+                                                             [ob_mode, ypos, xpos],  # object mode index and offset
+                                                             [exit_idx, 0, 0],  # exit wave index
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE)
+        for idx in range(G):
+            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) 
+
+
+        POUK = PoUpdateKernel()
+
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+
+        object_array_dev = cp.asarray(object_array)
+        object_array_denominator_dev = cp.asarray(object_array_denominator)
+        probe_dev = cp.asarray(probe)
+        exit_wave_dev = cp.asarray(exit_wave)
+        if not atomics:
+            addr2 = np.ascontiguousarray(np.transpose(addr, (2, 3, 0, 1)))  # axes become (5, 3, positions, modes)
+            addr_dev = cp.asarray(addr2)
+        else:
+            addr_dev = cp.asarray(addr)
+
+        # print(object_array_denominator)
+        POUK.ob_update(addr_dev, object_array_dev, object_array_denominator_dev, probe_dev, exit_wave_dev, atomics=atomics)
+        # print("\n\n cuda  version")
+        # print(repr(object_array_dev.get()))
+        # print(repr(object_array_denominator_dev.get()))
+        nPOUK.ob_update(addr, object_array, object_array_denominator, probe, exit_wave)  # reference run on the numpy arrays
+        # print("\n\n numpy version")
+        # print(repr(object_array_denominator))
+        # print(repr(object_array))
+
+
+        np.testing.assert_array_equal(object_array, object_array_dev.get(),
+                                      err_msg="The object array has not been updated as expected")
+
+
+        np.testing.assert_array_equal(object_array_denominator, object_array_denominator_dev.get(),
+                                      err_msg="The object array denominatorhas not been updated as expected")
+
+
+    def test_ob_update_atomics_UNITY(self):
+        self.ob_update_UNITY_tester(atomics=True)  # CuPy vs numpy comparison with atomics=True
+    
+    def test_ob_update_tiled_UNITY(self):
+        self.ob_update_UNITY_tester(atomics=False)  # CuPy vs numpy comparison with atomics=False
+
+    def pr_update_REGRESSION_tester(self, atomics=True):
+        '''
+        Run pr_update on CuPy arrays and compare against hard-coded expected values. Setup:
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  # object size y
+        I = C + npts_greater_than  # object size x
+
+        scan_pts = 2  # one dimensional scan point number
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes  # 16 exit waves: 4 scan positions (2x2 grid) x 4 mode combinations
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):  # one iteration per scan position
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],  # probe mode index
+                                                             [ob_mode, ypos, xpos],  # object mode index and offset
+                                                             [exit_idx, 0, 0],  # exit wave index
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE)
+        for idx in range(D):
+            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) 
+
+        POUK = PoUpdateKernel()
+
+        # print("probe array before:")
+        # print(repr(probe))
+        # print("probe denominator array before:")
+        # print(repr(probe_denominator))
+
+        object_array_dev = cp.asarray(object_array)
+        probe_denominator_dev = cp.asarray(probe_denominator)
+        probe_dev = cp.asarray(probe)
+        exit_wave_dev = cp.asarray(exit_wave)
+        if not atomics:
+            addr2 = np.ascontiguousarray(np.transpose(addr, (2, 3, 0, 1)))  # axes become (5, 3, positions, modes)
+            addr_dev = cp.asarray(addr2)
+        else:
+            addr_dev = cp.asarray(addr)
+
+
+        POUK.pr_update(addr_dev, probe_dev, probe_denominator_dev, object_array_dev, exit_wave_dev, atomics=atomics)
+
+        # NOTE: the debug prints below show the host arrays, which this test never updates.
+        # print(repr(probe))
+        # print("probe denominator array after:")
+        # print(repr(probe_denominator))
+        expected_probe = np.array([[[313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j],
+                                    [313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j],
+                                    [313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j],
+                                    [313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j],
+                                    [313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j]],
+
+                                   [[394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j],
+                                    [394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j],
+                                    [394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j],
+                                    [394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j],
+                                    [394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j]]],
+                                  dtype=COMPLEX_TYPE)
+
+        np.testing.assert_array_equal(probe_dev.get(), expected_probe,
+                                      err_msg="The probe has not been updated as expected")
+
+        expected_probe_denominator = np.array([[[138., 138., 138., 138., 138.],
+                                                [138., 138., 138., 138., 138.],
+                                                [138., 138., 138., 138., 138.],
+                                                [138., 138., 138., 138., 138.],
+                                                [138., 138., 138., 138., 138.]],
+
+                                               [[143., 143., 143., 143., 143.],
+                                                [143., 143., 143., 143., 143.],
+                                                [143., 143., 143., 143., 143.],
+                                                [143., 143., 143., 143., 143.],
+                                                [143., 143., 143., 143., 143.]]],
+                                              dtype=FLOAT_TYPE)
+
+        np.testing.assert_array_equal(probe_denominator_dev.get(), expected_probe_denominator,
+                                      err_msg="The probe denominatorhas not been updated as expected")
+
+
+    def test_pr_update_atomics_REGRESSION(self):
+        self.pr_update_REGRESSION_tester(atomics=True)  # regression check with atomics=True
+
+    def test_pr_update_tiled_REGRESSION(self):
+        self.pr_update_REGRESSION_tester(atomics=False)  # atomics=False ("tiled" per the test name)
+
+    def pr_update_UNITY_tester(self, atomics=True):
+        '''
+        Run pr_update on CuPy copies and on the numpy originals and compare results. Setup:
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  # object size y
+        I = C + npts_greater_than  # object size x
+
+        scan_pts = 2  # one dimensional scan point number
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes  # 16 exit waves: 4 scan positions (2x2 grid) x 4 mode combinations
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):  # one iteration per scan position
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],  # probe mode index
+                                                             [ob_mode, ypos, xpos],  # object mode index and offset
+                                                             [exit_idx, 0, 0],  # exit wave index
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE)
+        for idx in range(D):
+            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2)
+
+        POUK = PoUpdateKernel()
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+
+        # print("probe array before:")
+        # print(repr(probe))
+        # print("probe denominator array before:")
+        # print(repr(probe_denominator))
+
+        object_array_dev = cp.asarray(object_array)
+        probe_denominator_dev = cp.asarray(probe_denominator)
+        probe_dev = cp.asarray(probe)
+        exit_wave_dev = cp.asarray(exit_wave)
+        if not atomics:
+            addr2 = np.ascontiguousarray(np.transpose(addr, (2, 3, 0, 1)))  # axes become (5, 3, positions, modes)
+            addr_dev = cp.asarray(addr2)
+        else:
+            addr_dev = cp.asarray(addr)
+
+
+        POUK.pr_update(addr_dev, probe_dev, probe_denominator_dev, object_array_dev, exit_wave_dev, atomics=atomics)
+        nPOUK.pr_update(addr, probe, probe_denominator, object_array, exit_wave)  # reference run on the numpy arrays
+
+        # print("probe array after:")
+        # print(repr(probe))
+        # print("probe denominator array after:")
+        # print(repr(probe_denominator))
+
+        np.testing.assert_array_equal(probe, probe_dev.get(),
+                                      err_msg="The probe has not been updated as expected")
+
+        np.testing.assert_array_equal(probe_denominator, probe_denominator_dev.get(),
+                                      err_msg="The probe denominatorhas not been updated as expected")
+
+
+    def test_pr_update_atomics_UNITY(self):
+        self.pr_update_UNITY_tester(atomics=True)  # CuPy vs numpy comparison with atomics=True
+
+    def test_pr_update_tiled_UNITY(self):
+        self.pr_update_UNITY_tester(atomics=False)  # CuPy vs numpy comparison with atomics=False
+
+
+    def pr_update_ML_tester(self, atomics=False):
+        '''
+        Run pr_update_ML on the arrays from prepare_arrays() and check the probe against fixed values.
+        '''
+        addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays()
+        '''
+        test
+        '''
+        POUK = PoUpdateKernel()
+
+        POUK.allocate()  # this doesn't do anything, but is the call pattern.
+
+        if not atomics:
+            addr2 = np.ascontiguousarray(np.transpose(addr.get(), (2, 3, 0, 1)))  # fetch to host, move the last two axes to the front
+            addr = cp.asarray(addr2)
+
+        POUK.pr_update_ML(addr, probe, object_array, exit_wave, atomics=atomics)
+
+        expected_probe = np.array([[[625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j],
+                                    [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j]],
+
+                                   [[786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j],
+                                    [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j]]],
+                                  dtype=COMPLEX_TYPE)
+
+        np.testing.assert_array_equal(probe.get(), expected_probe,
+                                      err_msg="The probe has not been updated as expected")
+
+    def test_pr_update_ML_atomics_REGRESSION(self):
+        self.pr_update_ML_tester(True)  # positional argument is atomics=True
+
+    def test_pr_update_ML_tiled_REGRESSION(self):
+        self.pr_update_ML_tester(False)  # positional argument is atomics=False
+
+    def ob_update_ML_tester(self, atomics=True):
+        '''
+        Run ob_update_ML on the arrays from prepare_arrays() and check the object against fixed values.
+        '''
+        addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays()
+        '''
+        test
+        '''
+        POUK = PoUpdateKernel()
+
+        POUK.allocate()  # this doesn't do anything, but is the call pattern.
+
+        if not atomics:
+            addr2 = np.ascontiguousarray(np.transpose(addr.get(), (2, 3, 0, 1)))  # fetch to host, move the last two axes to the front
+            addr = cp.asarray(addr2)
+
+        POUK.ob_update_ML(addr, object_array, probe, exit_wave, atomics=atomics)
+
+        expected_object_array = np.array(
+            [[[29. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 77. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j],
+              [125. + 1.j, 297. + 1.j, 297. + 1.j, 297. + 1.j, 297. + 1.j, 173. + 1.j, 1. + 1.j],
+              [1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j]],
+
+             [[44. + 4.j, 132. + 4.j, 132. + 4.j, 132. + 4.j, 132. + 4.j, 92. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j],
+              [140. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 188. + 4.j, 4. + 4.j],
+              [4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j]]],
+            dtype=COMPLEX_TYPE)
+
+        np.testing.assert_array_equal(object_array.get(), expected_object_array,
+                                      err_msg="The object array has not been updated as expected")
+
+    def test_ob_update_ML_atomics_REGRESSION(self):
+        self.ob_update_ML_tester(True)  # positional argument is atomics=True
+
+    def test_ob_update_ML_tiled_REGRESSION(self):
+        self.ob_update_ML_tester(False)  # positional argument is atomics=False
+
+    def test_ob_update_local_UNITY(self):
+        '''
+        Run pr_norm_local + ob_update_local on CuPy and numpy copies and compare the object. Setup:
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  #  object size y
+        I = C + npts_greater_than  #  object size x
+
+        scan_pts = 1  # one dimensional scan point number
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes  # 4 exit waves: 1 scan position x 4 mode combinations
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+        auxiliary_wave = exit_wave.copy() * 2
+
+        probe_norm = np.empty(shape=(1,B,C), dtype=FLOAT_TYPE)  # uninitialised; presumably overwritten by pr_norm_local - TODO confirm
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):  # one iteration per scan position
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],  # probe mode index
+                                                             [ob_mode, ypos, xpos],  # object mode index and offset
+                                                             [exit_idx, 0, 0],  # exit wave index
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+        POUK = PoUpdateKernel(queue_thread=self.stream)
+
+        object_array_dev = cp.asarray(object_array)
+        probe_dev = cp.asarray(probe)
+        exit_wave_dev = cp.asarray(exit_wave)
+        auxiliary_wave_dev = cp.asarray(auxiliary_wave)
+        probe_norm_dev = cp.asarray(probe_norm)
+        addr_dev = cp.asarray(addr)
+
+        POUK.pr_norm_local(addr_dev, probe_dev, probe_norm_dev)
+        POUK.ob_update_local(addr_dev, object_array_dev, probe_dev, exit_wave_dev, auxiliary_wave_dev, probe_norm_dev, a=0.5, b=0.5)
+        nPOUK.pr_norm_local(addr, probe, probe_norm)  # same two calls on the numpy arrays
+        nPOUK.ob_update_local(addr, object_array, probe, exit_wave, auxiliary_wave, probe_norm, a=0.5, b=0.5)
+
+        np.testing.assert_allclose(object_array_dev.get(), object_array, rtol=1e-6, atol=1e-6,
+                                      err_msg="The object array has not been updated as expected")
+
+    def test_pr_update_local_UNITY(self):
+        '''
+        Run ob_norm_local + pr_update_local on CuPy and numpy copies and compare the probe. Setup:
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  #  object size y
+        I = C + npts_greater_than  #  object size x
+
+        scan_pts = 1  # one dimensional scan point number
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes  # 4 exit waves: 1 scan position x 4 mode combinations
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+        auxiliary_wave = exit_wave.copy() * 1.5
+
+        object_norm = np.empty(shape=(1,B,C), dtype=FLOAT_TYPE)  # uninitialised; presumably overwritten by ob_norm_local - TODO confirm
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):  # one iteration per scan position
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],  # probe mode index
+                                                             [ob_mode, ypos, xpos],  # object mode index and offset
+                                                             [exit_idx, 0, 0],  # exit wave index
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+        POUK = PoUpdateKernel()
+
+        object_array_dev = cp.asarray(object_array)
+        probe_dev = cp.asarray(probe)
+        exit_wave_dev = cp.asarray(exit_wave)
+        auxiliary_wave_dev = cp.asarray(auxiliary_wave)
+        object_norm_dev = cp.asarray(object_norm)
+        addr_dev = cp.asarray(addr)
+
+        POUK.ob_norm_local(addr_dev, object_array_dev, object_norm_dev)
+        POUK.pr_update_local(addr_dev,  probe_dev, object_array_dev,exit_wave_dev, auxiliary_wave_dev, object_norm_dev, cp.max(object_norm_dev), a=0.5, b=0.5)
+        nPOUK.ob_norm_local(addr, object_array, object_norm)  # same two calls on the numpy arrays
+        nPOUK.pr_update_local(addr, probe, object_array, exit_wave, auxiliary_wave, object_norm, object_norm.max(), a=0.5, b=0.5)
+
+        np.testing.assert_allclose(probe_dev.get(), probe, rtol=1e-6, atol=1e-6,
+                                      err_msg="The probe has not been updated as expected")
+
+    def test_ob_norm_local_UNITY(self):
+        '''
+        Run ob_norm_local on CuPy and numpy copies of the same data and compare the norm. Setup:
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  #  object size y
+        I = C + npts_greater_than  #  object size x
+
+        scan_pts = 1  # one dimensional scan point number
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes  # 1 scan position x 4 mode combinations; not used below
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+        object_norm = np.empty(shape=(1,B,C), dtype=FLOAT_TYPE)  # uninitialised; presumably overwritten by ob_norm_local - TODO confirm
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):  # one iteration per scan position
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],  # probe mode index
+                                                             [ob_mode, ypos, xpos],  # object mode index and offset
+                                                             [exit_idx, 0, 0],  # exit wave index
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+        POUK = PoUpdateKernel(queue_thread=self.stream)
+
+        object_array_dev = cp.asarray(object_array)
+        object_norm_dev = cp.asarray(object_norm)
+        addr_dev = cp.asarray(addr)
+
+        POUK.ob_norm_local(addr_dev, object_array_dev, object_norm_dev)
+        nPOUK.ob_norm_local(addr, object_array, object_norm)  # same call on the numpy arrays
+
+        np.testing.assert_allclose(object_norm_dev.get(), object_norm, rtol=1e-6, atol=1e-6,
+                                      err_msg="The object norm has not been updated as expected")
+
+    def test_pr_norm_local_UNITY(self):
+        '''
+        Run pr_norm_local on CuPy and numpy copies of the same data and compare the norm. Setup:
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  #  object size y
+        I = C + npts_greater_than  #  object size x
+
+        scan_pts = 1  # one dimensional scan point number
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes  # 1 scan position x 4 mode combinations; not used below
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+        probe_norm = np.empty(shape=(1,B,C), dtype=FLOAT_TYPE)  # uninitialised; presumably overwritten by pr_norm_local - TODO confirm
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):  # one iteration per scan position
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],  # probe mode index
+                                                             [ob_mode, ypos, xpos],  # object mode index and offset
+                                                             [exit_idx, 0, 0],  # exit wave index
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+        POUK = PoUpdateKernel()
+
+        probe_dev = cp.asarray(probe)
+        probe_norm_dev = cp.asarray(probe_norm) 
+        addr_dev = cp.asarray(addr)
+
+        POUK.pr_norm_local(addr_dev,  probe_dev, probe_norm_dev)
+        nPOUK.pr_norm_local(addr, probe, probe_norm)  # same call on the numpy arrays
+
+        np.testing.assert_allclose(probe_norm_dev.get(), probe_norm, rtol=1e-6, atol=1e-6,
+                                      err_msg="The probe norm has not been updated as expected")
+
+
+if __name__ == '__main__':
+    unittest.main()  # run this module's tests when executed as a script
diff --git a/test/accelerate_tests/cuda_cupy_tests/position_correction_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/position_correction_kernel_test.py
new file mode 100644
index 000000000..7e817fa60
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/position_correction_kernel_test.py
@@ -0,0 +1,149 @@
+'''
+
+
+'''
+
+import unittest
+import numpy as np
+from . import CupyCudaTest, have_cupy
+from ptypy import utils as u
+
+if have_cupy():
+    import cupy as cp
+    from ptypy.accelerate.cuda_cupy.kernels import PositionCorrectionKernel
+    from ptypy.accelerate.base.kernels import PositionCorrectionKernel as abPositionCorrectionKernel
+
+COMPLEX_TYPE = np.complex64
+FLOAT_TYPE = np.float32
+INT_TYPE = np.int32
+
+
+class PositionCorrectionKernelTest(CupyCudaTest):
+
+    def setUp(self):
+        CupyCudaTest.setUp(self)
+        self.params = u.Param()
+        self.params.nshifts = 4
+        self.params.method = "Annealing"
+        self.params.amplitude = 2e-9
+        self.params.start = 0
+        self.params.stop = 10
+        self.params.max_shift = 2e-9
+        self.params.amplitude_decay = True
+        self.resolution = [1e-9,1e-9]
+
+    def update_addr_and_error_state_UNITY_helper(self, size, modes):
+        ## Arrange
+        addr = np.ones((size, modes, 5, 3), dtype=np.int32)
+        mangled_addr = 2 * addr
+        err_state = np.zeros((size,), dtype=np.float32)
+        err_state[5:] = 2.
+        err_sum = np.ones((size, ), dtype=np.float32)
+        addr_gpu = cp.asarray(addr)
+        mangled_addr_gpu = cp.asarray(mangled_addr)
+        err_state_gpu = cp.asarray(err_state)
+        err_sum_gpu = cp.asarray(err_sum)
+        aux = np.ones((1,1,1), dtype=np.complex64)
+
+        ## Act
+        PCK = PositionCorrectionKernel(aux, modes, self.params, self.resolution, queue_thread=self.stream)
+        PCK.update_addr_and_error_state(addr_gpu, err_state_gpu, mangled_addr_gpu, err_sum_gpu)
+        abPCK = abPositionCorrectionKernel(aux, modes, self.params, self.resolution)
+        abPCK.update_addr_and_error_state(addr, err_state, mangled_addr, err_sum)
+
+        ## Assert
+        np.testing.assert_array_equal(addr_gpu.get(), addr)
+        np.testing.assert_array_equal(err_state_gpu.get(), err_state)
+
+    def test_update_addr_and_error_state_UNITY_small_onemode(self):
+        self.update_addr_and_error_state_UNITY_helper(4, 1)
+
+    def test_update_addr_and_error_state_UNITY_large_onemode(self):
+        self.update_addr_and_error_state_UNITY_helper(323, 1)
+    
+    def test_update_addr_and_error_state_UNITY_small_multimode(self):
+        self.update_addr_and_error_state_UNITY_helper(4, 3)
+
+    def test_update_addr_and_error_state_UNITY_large_multimode(self):
+        self.update_addr_and_error_state_UNITY_helper(323, 3)
+
+    def log_likelihood_ml_UNITY(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        G = 2  # number of object modes
+
+        E = B  # probe size y
+        F = C  # probe size x
+
+        scan_pts = 2  # one dimensional scan point number
+
+        N = scan_pts ** 2
+        total_number_modes = G * D
+        A = N * total_number_modes  # 16 exit waves in total: a 4 point scan pattern (2x2 grid) times 4 modes
+
+        f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+
+        fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE)  # the measured magnitudes NxBxC
+        fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype)
+        fmag[:] = fmag_fill
+        I = fmag**2
+
+        mask = np.empty(shape=(N, B, C),
+                        dtype=FLOAT_TYPE)  # the masks for the measured magnitudes either 1xBxC or NxBxC
+        mask_fill = np.ones_like(mask)
+        mask_fill[::2, ::2] = 0  # checkerboard for testing
+        mask[:] = mask_fill
+        w = mask /(I+1.)
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((N,))
+        Y = Y.reshape((N,))
+
+        addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [position_idx, 0, 0],
+                                                             [position_idx, 0, 0]])
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+        '''
+        test
+        '''
+        mask_sum = mask.sum(-1).sum(-1)
+        LLerr = np.zeros_like(mask_sum, dtype=np.float32)
+        f_d = cp.asarray(f)
+        w_d = cp.asarray(w)
+        I_d = cp.asarray(I)
+        addr_d = cp.asarray(addr)
+        LLerr_d = cp.asarray(LLerr)
+
+        ## Act
+        PCK = PositionCorrectionKernel(f, total_number_modes, self.params, self.resolution, queue_thread=self.stream)
+        abPCK = abPositionCorrectionKernel(f, total_number_modes, self.params, self.resolution)
+        abPCK.log_likelihood_ml(f, addr, I, w, LLerr)
+        PCK.log_likelihood_ml(f_d, addr_d, I_d, w_d, LLerr_d)
+
+        expected_err_phot = LLerr
+        measured_err_phot = LLerr_d.get()
+
+        np.testing.assert_allclose(expected_err_phot, measured_err_phot, err_msg="Numpy log-likelihood error "
+                                                                                 "is \n%s, \nbut gpu log-likelihood error is \n%s, \n " % (
+                                                                                 repr(expected_err_phot),
+                                                                                 repr(measured_err_phot)), rtol=1e-5)
diff --git a/test/accelerate_tests/cuda_cupy_tests/propagation_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/propagation_kernel_test.py
new file mode 100644
index 000000000..c221d59c0
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/propagation_kernel_test.py
@@ -0,0 +1,157 @@
+'''
+
+'''
+
+import numpy as np
+import ptypy.utils as u
+from . import CupyCudaTest, have_cupy
+
+if have_cupy():
+    import cupy as cp
+    from ptypy.accelerate.cuda_cupy.kernels import PropagationKernel
+
+from ptypy.core import geometry
+from ptypy.core import Base as theBase
+
+# subclass for dictionary access
+Base = type('Base',(theBase,),{})
+
+COMPLEX_TYPE = np.complex64
+FLOAT_TYPE = np.float32
+INT_TYPE = np.int32
+
+class PropagationKernelTest(CupyCudaTest):
+
+    def set_up_farfield(self,shape, resolution=None):
+        P = Base()
+        P.CType = COMPLEX_TYPE
+        P.Ftype = FLOAT_TYPE
+        g = u.Param()
+        g.energy = None # u.keV2m(1.0)/6.32e-7
+        g.lam = 5.32e-7
+        g.distance = 15e-2
+        g.psize = 24e-6
+        g.shape = shape
+        g.propagation = "farfield"
+        if resolution is not None:
+            g.resolution = resolution
+        G = geometry.Geo(owner=P, pars=g)
+        return G
+
+    def set_up_nearfield(self, shape):
+        P = Base()
+        P.CType = COMPLEX_TYPE
+        P.Ftype = FLOAT_TYPE
+        g = u.Param()
+        g.energy = None # u.keV2m(1.0)/6.32e-7
+        g.lam = 1e-10
+        g.distance = 1.0
+        g.psize = 100e-9
+        g.shape = shape
+        g.propagation = "nearfield"
+        G = geometry.Geo(owner=P, pars=g)
+        return G
+
+    def test_farfield_propagator_forward_UNITY(self):
+        # setup
+        SH = (2,16,16)
+        aux = np.zeros((SH), dtype=COMPLEX_TYPE)
+        aux[:,5:11,5:11] = 1. + 2j
+        aux_d = cp.asarray(aux)
+        geo = self.set_up_farfield(SH[1:])
+
+        # test
+        aux = geo.propagator.fw(aux)
+        PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream)
+        PropK.allocate()
+        PropK.fw(aux_d, aux_d)
+
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, 
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
+
+    def test_farfield_propagator_backward_UNITY(self):
+        # setup
+        SH = (2,16,16)
+        aux = np.zeros((SH), dtype=COMPLEX_TYPE)
+        aux[:,5:11,5:11] = 1. + 2j
+        aux_d = cp.asarray(aux)
+        geo = self.set_up_farfield(SH[1:])
+
+        # test
+        aux = geo.propagator.bw(aux)
+        PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream)
+        PropK.allocate()
+        PropK.bw(aux_d, aux_d)
+
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, 
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
+
+    def test_farfield_propagator_forward_crop_pad_UNITY(self):
+        # setup
+        SH = (2,16,16)
+        aux = np.zeros((SH), dtype=COMPLEX_TYPE)
+        aux[:,5:11,5:11] = 1. + 2j
+        aux_d = cp.asarray(aux)
+        geo = self.set_up_farfield(SH[1:])
+        geo = self.set_up_farfield(SH[1:], resolution=0.5*geo.resolution)
+
+        # test
+        aux = geo.propagator.fw(aux)
+        PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream)
+        PropK.allocate()
+        PropK.fw(aux_d, aux_d)
+
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, 
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
+
+    def test_farfield_propagator_backward_crop_pad_UNITY(self):
+        # setup
+        SH = (2,16,16)
+        aux = np.zeros((SH), dtype=COMPLEX_TYPE)
+        aux[:,5:11,5:11] = 1. + 2j
+        aux_d = cp.asarray(aux)
+        geo = self.set_up_farfield(SH[1:])
+        geo = self.set_up_farfield(SH[1:], resolution=0.5*geo.resolution)
+
+        # test
+        aux = geo.propagator.bw(aux)
+        PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream)
+        PropK.allocate()
+        PropK.bw(aux_d, aux_d)
+
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, 
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
+
+    def test_nearfield_propagator_forward_UNITY(self):
+        # setup
+        SH = (2,16,16)
+        aux = np.zeros((SH), dtype=COMPLEX_TYPE)
+        aux[:,5:11,5:11] = 1. + 2j
+        aux_d = cp.asarray(aux)
+        geo = self.set_up_nearfield(SH[1:])
+        
+        # test
+        aux = geo.propagator.fw(aux)
+        PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream)
+        PropK.allocate()
+        PropK.fw(aux_d, aux_d)
+
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, 
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
+
+    def test_nearfield_propagator_backward_UNITY(self):
+        # setup
+        SH = (2,16,16)
+        aux = np.zeros((SH), dtype=COMPLEX_TYPE)
+        aux[:,5:11,5:11] = 1. + 2j
+        aux_d = cp.asarray(aux)
+        geo = self.set_up_nearfield(SH[1:])
+    
+        # test
+        aux = geo.propagator.bw(aux)
+        PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream)
+        PropK.allocate()
+        PropK.bw(aux_d, aux_d)
+
+        np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, 
+            err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get())))
\ No newline at end of file
diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_setstream_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_setstream_test.py
index 1220702b7..5816e3bf3 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/fft_setstream_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/fft_setstream_test.py
@@ -25,6 +25,7 @@ def helper(self, FFT):
         t2 = time.time()
         dur1 = t2 - t1
         f_dev = gpuarray.to_gpu(f)
+        self.stream.synchronize()
 
         # measure with events to make sure that something actually 
         # happened in the right stream

From df073704eb5e6bdb3b609bbd1623ca83339b2ea2 Mon Sep 17 00:00:00 2001
From: Benedikt Daurer <benedikt.daurer@diamond.ac.uk>
Date: Tue, 24 Jan 2023 16:42:27 +0000
Subject: [PATCH 03/37] bump version to 0.8

---
 ptypy/version.py |  6 +++---
 release_notes.md | 11 +++++++++++
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/ptypy/version.py b/ptypy/version.py
index 272c25708..ee3909a1b 100644
--- a/ptypy/version.py
+++ b/ptypy/version.py
@@ -1,7 +1,7 @@
 
-short_version = '0.7.0'
-version = '0.7.0'
-release = True
+short_version = '0.8.0'
+version = '0.8.0'
+release = False
 
 if not release:
     version += '.dev'
diff --git a/release_notes.md b/release_notes.md
index 61ea173d2..e2fd647fc 100644
--- a/release_notes.md
+++ b/release_notes.md
@@ -1,3 +1,14 @@
+# PtyPy 0.8 release notes
+
+An alternative CUDA implementation based on [`cupy`](https://cupy.dev/) 
+has been implemented, providing the same features as the `PyCuda`-based
+engine. 
+It can be imported using
+```python
+import ptypy
+ptypy.load_gpu_engines('cupy')
+```
+
 # PtyPy 0.7 release notes
 
 This release is focused on improving the usability of PtyPy in Jupyter notebooks in preparation for the 

From 1bb839745cfb210cfab9e07ba565be713aa883b5 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Wed, 15 Feb 2023 14:50:21 +0000
Subject: [PATCH 04/37] non-threaded autoplotting (Jupyter) should only be on
 when autoplot is active (#480)

---
 ptypy/core/ptycho.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ptypy/core/ptycho.py b/ptypy/core/ptycho.py
index 36ed97a28..926aad919 100644
--- a/ptypy/core/ptycho.py
+++ b/ptypy/core/ptycho.py
@@ -703,7 +703,7 @@ def run(self, label=None, epars=None, engine=None):
                                 'Exit %.2e' % tuple(err))
                     imsg = '%(engine)s: Iteration # %(iteration)d/%(numiter)d :: ' %info + \
                                    'Fourier %.2e, Photons %.2e, Exit %.2e' %tuple(err)
-                    if not self.p.io.autoplot.threaded:
+                    if (self.p.io.autoplot.active) and (not self.p.io.autoplot.threaded):
                         if not (info["iteration"] % self.p.io.autoplot.interval):
                             if self._jupyter_client is None:
                                 from IPython import display

From 1c9d832f807dd67d1d0ac0ff46688b9454902559 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Wed, 15 Feb 2023 14:51:23 +0000
Subject: [PATCH 05/37] SimScan: reset diff storage to zeros (#479)

---
 ptypy/simulations/simscan.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ptypy/simulations/simscan.py b/ptypy/simulations/simscan.py
index 1785ff7b9..a8444f8e4 100644
--- a/ptypy/simulations/simscan.py
+++ b/ptypy/simulations/simscan.py
@@ -164,6 +164,10 @@ def __init__(self, pars=None, **kwargs):
         P = self.manipulate_ptycho(P)
         #############################################################
 
+        # Make sure all diff storages are empty
+        for name, storage in P.diff.S.items():
+            storage.data.fill(0.)
+
         # Simulate diffraction signal
         logger.info('Propagating exit waves.')
         for name,pod in P.pods.items():

From 4d46f5f0a18cc961599479922a19605a17d7e057 Mon Sep 17 00:00:00 2001
From: jsouter <107045742+jsouter@users.noreply.github.com>
Date: Wed, 22 Feb 2023 08:55:00 +0000
Subject: [PATCH 06/37] Load data during creation with SwmrLoader class (#428)

* Added SwmrLoader and introduced new logic for live processing
---------

Co-authored-by: Benedikt Daurer <benedikt.daurer@diamond.ac.uk>
---
 ptypy/core/manager.py           |   4 +
 ptypy/core/ptycho.py            |  20 ++++-
 ptypy/engines/base.py           |   1 +
 ptypy/experiment/hdf5_loader.py | 139 +++++++++++---------------------
 ptypy/experiment/swmr_loader.py | 125 ++++++++++++++++++++++++++++
 5 files changed, 197 insertions(+), 92 deletions(-)
 create mode 100644 ptypy/experiment/swmr_loader.py

diff --git a/ptypy/core/manager.py b/ptypy/core/manager.py
index 3bb7278ef..0d9d7d341 100644
--- a/ptypy/core/manager.py
+++ b/ptypy/core/manager.py
@@ -1645,6 +1645,10 @@ def _from_dict(cls, dct):
     @property
     def data_available(self):
         return any(s.data_available for s in list(self.scans.values()))
+    
+    @property
+    def end_of_scan(self):
+        return all(s.ptyscan.end_of_scan for s in list(self.scans.values()))
 
     def new_data(self):
         """
diff --git a/ptypy/core/ptycho.py b/ptypy/core/ptycho.py
index 926aad919..eab95b33f 100644
--- a/ptypy/core/ptycho.py
+++ b/ptypy/core/ptycho.py
@@ -109,6 +109,12 @@ class Ptycho(Base):
     lowlim = 1
     userlevel = 1
 
+    [min_frames_for_recon]
+    default = 0
+    type = int
+    help = Minimum number of frames to be loaded before reconstruction can start.
+    doc = For on-the-fly (live) processing, the first reconstruction engine will wait until this many frames have been loaded.
+
     [dry_run]
     default = False
     help = Dry run switch
@@ -524,8 +530,9 @@ def init_data(self, print_stats=True):
         # Load the data. This call creates automatically the scan managers,
         # which create the views and the PODs. Sets self.new_data
         with LogTime(self.p.io.benchmark == 'all') as t:
-            self.new_data = self.model.new_data()
-        if (self.p.io.benchmark == 'all') and parallel.master: self.benchmark.data_load += t.duration
+            while not self.new_data:
+                self.new_data = self.model.new_data()
+            if (self.p.io.benchmark == 'all') and parallel.master: self.benchmark.data_load += t.duration
 
         # Print stats
         parallel.barrier()
@@ -677,6 +684,10 @@ def run(self, label=None, epars=None, engine=None):
                         engine.prepare()
                     if (self.p.io.benchmark == 'all') and parallel.master: self.benchmark.engine_prepare += t.duration
 
+                # Keep loading data, unless we have reached minimum nr. of frames or end of scan
+                if (len(self.diff.V) < self.p.min_frames_for_recon) and not self.model.end_of_scan:
+                    continue
+
                 auto_save = self.p.io.autosave
                 if auto_save.active and auto_save.interval > 0:
                     if engine.curiter % auto_save.interval == 0:
@@ -686,6 +697,11 @@ def run(self, label=None, epars=None, engine=None):
                         self.runtime.last_save = engine.curiter
                         logger.info(headerline())
 
+                # If not end of scan, expand total number of iterations
+                # This is to make sure that the specified nr. of iterations is guaranteed once all data is loaded
+                if not self.model.end_of_scan:
+                    engine.numiter += engine.p.numiter_contiguous
+
                 # One iteration
                 with LogTime(self.p.io.benchmark == 'all') as t:
                     engine.iterate()
diff --git a/ptypy/engines/base.py b/ptypy/engines/base.py
index 78f1f6e04..a7ffb6cee 100644
--- a/ptypy/engines/base.py
+++ b/ptypy/engines/base.py
@@ -42,6 +42,7 @@ class BaseEngine(object):
     type = int
     lowlim = 1
     help = Total number of iterations
+    doc = For on-the-fly (live) processing, the reconstruction engine will iterate at least this many times after all data has been loaded.
 
     [numiter_contiguous]
     default = 1
diff --git a/ptypy/experiment/hdf5_loader.py b/ptypy/experiment/hdf5_loader.py
index 2459069c9..80ecc2b87 100644
--- a/ptypy/experiment/hdf5_loader.py
+++ b/ptypy/experiment/hdf5_loader.py
@@ -42,19 +42,6 @@ class Hdf5Loader(PtyScan):
           It is assumed in this latter case that the fast axis in the scan corresponds
           the fast axis on disc (i.e. C-ordered layout).
 
-    [intensities.is_swmr]
-    default = False
-    type = bool
-    help = If True, then intensities are assumed to be a swmr dataset that is being written as processing
-           is taking place.
-
-    [intensities.live_key]
-    default = None
-    type = str
-    help = Key to live keys inside the intensities.file (used only if is_swmr is True)
-    doc = Live_keys indicate where the data collection has progressed to. They are zero at the 
-          scan start, but non-zero when the position is complete.
-
     [intensities.file]
     default = None
     type = str
@@ -79,19 +66,6 @@ class Hdf5Loader(PtyScan):
             * axis_data.shape (C,) for data.shape (C*D, frame_size_m, frame_size_n) where D is the
               size of the other axis.
 
-    [positions.is_swmr]
-    default = False
-    type = bool
-    help = If True, positions are assumed to be a swmr dataset that is being written as processing
-           is taking place.
-
-    [positions.live_key]
-    default = None
-    type = str
-    help = Live_keys indicate where the data collection has progressed to. They are zero at the 
-           scan start, but non-zero when the position is complete. If None whilst positions.is_swmr 
-           is True, use "intensities.live_key".
-
     [positions.file]
     default = None
     type = str
@@ -199,20 +173,6 @@ class Hdf5Loader(PtyScan):
     help = Parameters for per-point normalisation (i.e. ion chamber reading).
     doc = The shape of loaded data is assumed to have the same dimensionality as data.shape[:-2]
 
-    [normalisation.is_swmr]
-    default = False
-    type = bool
-    help = If this is set to be true, then normalisations are assumed to be swmr datasets that are being written as processing
-            is taking place.
-
-    [normalisation.live_key]
-    default = None
-    type = str
-    help = If normalisation.is_swmr is true then we need a live_key to know where the data collection has progressed to.
-            This is the key to these live keys inside the normalisation.file. If None, whilst normalisation.is_swmr is
-            True, then we just assume the same keys work for both normalisation and intensities. They are zero at the
-            scan start, but non-zero when the position is complete.
-
     [normalisation.file]
     default = None
     type = str
@@ -340,7 +300,7 @@ class Hdf5Loader(PtyScan):
           and converted to electron wavelengths.    
     """
 
-    def __init__(self, pars=None, **kwargs):
+    def __init__(self, pars=None, swmr=False, **kwargs):
         """
         hdf5 data loader
         """
@@ -369,8 +329,11 @@ def __init__(self, pars=None, **kwargs):
         self.preview_indices = None
         self.framefilter = None
         self._is_spectro_scan = False
-
+        self._is_swmr = swmr
+        
         self.fhandle_intensities = None
+        self.fhandle_positions_fast = None
+        self.fhandle_positions_slow = None
         self.fhandle_darkfield = None
         self.fhandle_flatfield = None
         self.fhandle_normalisation = None
@@ -417,17 +380,12 @@ def _params_check(self):
                     self.p.positions.fast_key]:
             raise RuntimeError("Missing some information about either the positions or the intensity mapping!")
 
-        if True in [self.p.intensities.is_swmr,
-                    self.p.positions.is_swmr,
-                    self.p.normalisation.is_swmr]:
-            raise NotImplementedError("Currently swmr functionality is not implemented! Coming soon...")
-
     def _spectro_scan_check(self):
         """
         make adjustments if dealing with a spectro scan
         """
         if None not in [self.p.recorded_energy.file, self.p.recorded_energy.key]:
-            with h5.File(self.p.recorded_energy.file, 'r') as f:
+            with h5.File(self.p.recorded_energy.file, 'r', swmr=self._is_swmr) as f:
                 _energy_dset = f[self.p.recorded_energy.key]
                 if len(_energy_dset.shape):
                     if _energy_dset.shape[0] > 1:
@@ -436,32 +394,32 @@ def _spectro_scan_check(self):
             self.p.outer_index = 0
         if self._is_spectro_scan:
             log(3, "This is appears to be a spectro scan, selecting index = {}".format(self.p.outer_index))
+        if self._is_spectro_scan and self._is_swmr:
+            raise RuntimeError("Spectro scans are currently not compatible with SWMR mode")
 
 
     def _prepare_intensity_and_positions(self):
         """
         Prep for loading intensity and position data
         """
-        self.fhandle_intensities = h5.File(self.p.intensities.file, 'r')
+        self.fhandle_intensities = h5.File(self.p.intensities.file, 'r', swmr=self._is_swmr)
         self.intensities = self.fhandle_intensities[self.p.intensities.key]
         self.intensities_dtype = self.intensities.dtype
         self.data_shape = self.intensities.shape
         if self._is_spectro_scan and self.p.outer_index is not None:
             self.data_shape = tuple(np.array(self.data_shape)[1:])
 
-        with h5.File(self.p.positions.file, 'r') as f:
-            fast_axis = f[self.p.positions.fast_key][...]
+        self.fhandle_positions_fast = h5.File(self.p.positions.file, 'r', swmr=self._is_swmr)
+        self.fast_axis = self.fhandle_positions_fast[self.p.positions.fast_key]
         if self._is_spectro_scan and self.p.outer_index is not None:
-            fast_axis = fast_axis[self.p.outer_index]
-        self.fast_axis = np.squeeze(fast_axis) if fast_axis.ndim > 2 else fast_axis
-        self.positions_fast_shape = self.fast_axis.shape
+            self.fast_axis = self.fast_axis[self.p.outer_index]
+        self.positions_fast_shape = np.squeeze(self.fast_axis).shape if self.fast_axis.ndim > 2 else self.fast_axis.shape
 
-        with h5.File(self.p.positions.file, 'r') as f:
-            slow_axis = f[self.p.positions.slow_key][...]
+        self.fhandle_positions_slow = h5.File(self.p.positions.file, 'r', swmr=self._is_swmr)
+        self.slow_axis = self.fhandle_positions_slow[self.p.positions.slow_key]
         if self._is_spectro_scan and self.p.outer_index is not None:
-            slow_axis = slow_axis[self.p.outer_index]
-        self.slow_axis = np.squeeze(slow_axis) if slow_axis.ndim > 2 else slow_axis
-        self.positions_slow_shape = self.slow_axis.shape
+            self.slow_axis = self.slow_axis[self.p.outer_index]
+        self.positions_slow_shape = np.squeeze(self.slow_axis).shape if self.slow_axis.ndim > 2 else self.slow_axis.shape
 
         log(3, "The shape of the \n\tdiffraction intensities is: {}\n\tslow axis data:{}\n\tfast axis data:{}".format(self.data_shape,
                                                                                                                       self.positions_slow_shape,
@@ -475,7 +433,7 @@ def _prepare_framefilter(self):
         Prep for framefilter
         """
         if None not in [self.p.framefilter.file, self.p.framefilter.key]:
-            with h5.File(self.p.framefilter.file, 'r') as f:
+            with h5.File(self.p.framefilter.file, 'r', swmr=self._is_swmr) as f:
                 self.framefilter = f[self.p.framefilter.key][()].squeeze() > 0 # turn into boolean
             if self._is_spectro_scan and self.p.outer_index is not None:
                 self.framefilter = self.framefilter[self.p.outer_index]
@@ -493,7 +451,7 @@ def _prepare_darkfield(self):
         Prep for darkfield
         """
         if None not in [self.p.darkfield.file, self.p.darkfield.key]:
-            self.fhandle_darkfield =  h5.File(self.p.darkfield.file, 'r')
+            self.fhandle_darkfield =  h5.File(self.p.darkfield.file, 'r', swmr=self._is_swmr)
             self.darkfield = self.fhandle_darkfield[self.p.darkfield.key]
             log(3, "The darkfield has shape: {}".format(self.darkfield.shape))
             if self.darkfield.shape == self.data_shape:
@@ -516,7 +474,7 @@ def _prepare_flatfield(self):
         Prep for flatfield
         """
         if None not in [self.p.flatfield.file, self.p.flatfield.key]:
-            self.fhandle_flatfield = h5.File(self.p.flatfield.file, 'r')
+            self.fhandle_flatfield = h5.File(self.p.flatfield.file, 'r', swmr=self._is_swmr)
             self.flatfield = self.fhandle_flatfield[self.p.flatfield.key]
             log(3, "The flatfield has shape: {}".format(self.flatfield.shape))
             if self.flatfield.shape == self.data_shape:
@@ -535,7 +493,7 @@ def _prepare_mask(self):
         Prep for mask
         """
         if None not in [self.p.mask.file, self.p.mask.key]:
-            self.fhandle_mask = h5.File(self.p.mask.file, 'r')
+            self.fhandle_mask = h5.File(self.p.mask.file, 'r', swmr=self._is_swmr)
             self.mask = self.fhandle_mask[self.p.mask.key]
             self.mask_dtype = self.mask.dtype
             log(3, "The mask has shape: {}".format(self.mask.shape))
@@ -557,7 +515,7 @@ def _prepare_normalisation(self):
         Prep for normalisation
         """
         if None not in [self.p.normalisation.file, self.p.normalisation.key]:
-            self.fhandle_normalisation = h5.File(self.p.normalisation.file, 'r')
+            self.fhandle_normalisation = h5.File(self.p.normalisation.file, 'r', swmr=self._is_swmr)
             self.normalisation = self.fhandle_normalisation[self.p.normalisation.key]
             self.normalisation_mean = self.normalisation[:].mean()
             self.normalisation_std  = self.normalisation[:].std()
@@ -577,7 +535,7 @@ def _prepare_meta_info(self):
         Prep for meta info (energy, distance, psize)
         """
         if None not in [self.p.recorded_energy.file, self.p.recorded_energy.key]:
-            with h5.File(self.p.recorded_energy.file, 'r') as f:
+            with h5.File(self.p.recorded_energy.file, 'r', swmr=self._is_swmr) as f:
                 if self._is_spectro_scan and self.p.outer_index is not None:
                     self.p.energy = float(f[self.p.recorded_energy.key][self.p.outer_index])
                 else:
@@ -587,13 +545,13 @@ def _prepare_meta_info(self):
             log(3, "loading energy={} from file".format(self.p.energy))
 
         if None not in [self.p.recorded_distance.file, self.p.recorded_distance.key]:
-            with h5.File(self.p.recorded_distance.file, 'r') as f:
+            with h5.File(self.p.recorded_distance.file, 'r', swmr=self._is_swmr) as f:
                 self.p.distance = float(f[self.p.recorded_distance.key][()] * self.p.recorded_distance.multiplier)
             self.meta.distance = self.p.distance
             log(3, "loading distance={} from file".format(self.p.distance))
         
         if None not in [self.p.recorded_psize.file, self.p.recorded_psize.key]:
-            with h5.File(self.p.recorded_psize.file, 'r') as f:
+            with h5.File(self.p.recorded_psize.file, 'r', swmr=self._is_swmr) as f:
                 self.p.psize = float(f[self.p.recorded_psize.key][()] * self.p.recorded_psize.multiplier)
             self.info.psize = self.p.psize
             log(3, "loading psize={} from file".format(self.p.psize))
@@ -642,13 +600,12 @@ def load_unmapped_raster_scan(self, indices):
         intensities = {}
         positions = {}
         weights = {}
-        sh = self.slow_axis.shape
         for ii in indices:
             slow_idx, fast_idx = self.preview_indices[:, ii]
-            intensity_index = slow_idx * sh[1] + fast_idx
+            intensity_index = slow_idx * self.slow_axis.shape[1] + fast_idx
             weights[ii], intensities[ii] = self.get_corrected_intensities(intensity_index)
-            positions[ii] = np.array([self.slow_axis[slow_idx, fast_idx] * self.p.positions.slow_multiplier,
-                                      self.fast_axis[slow_idx, fast_idx] * self.p.positions.fast_multiplier])
+            positions[ii] = np.array([np.squeeze(self.slow_axis[slow_idx, fast_idx]) * self.p.positions.slow_multiplier,
+                                      np.squeeze(self.fast_axis[slow_idx, fast_idx]) * self.p.positions.fast_multiplier])
         log(3, 'Data loaded successfully.')
         return intensities, positions, weights
 
@@ -658,9 +615,9 @@ def load_mapped_and_raster_scan(self, indices):
         weights = {}
         for jj in indices:
             slow_idx, fast_idx = self.preview_indices[:, jj]
-            weights[jj], intensities[jj] = self.get_corrected_intensities((slow_idx, fast_idx))  # or the other way round???
-            positions[jj] = np.array([self.slow_axis[slow_idx, fast_idx] * self.p.positions.slow_multiplier,
-                                      self.fast_axis[slow_idx, fast_idx] * self.p.positions.fast_multiplier])
+            weights[jj], intensities[jj] = self.get_corrected_intensities((slow_idx, fast_idx))
+            positions[jj] = np.array([np.squeeze(self.slow_axis[slow_idx, fast_idx]) * self.p.positions.slow_multiplier,
+                                      np.squeeze(self.fast_axis[slow_idx, fast_idx]) * self.p.positions.fast_multiplier])
         log(3, 'Data loaded successfully.')
         return intensities, positions, weights
 
@@ -671,9 +628,8 @@ def load_mapped_and_arbitrary_scan(self, indices):
         for ii in indices:
             jj = self.preview_indices[ii]
             weights[ii], intensities[ii] = self.get_corrected_intensities(jj)
-            positions[ii] = np.array([self.slow_axis[jj] * self.p.positions.slow_multiplier,
-                                      self.fast_axis[jj] * self.p.positions.fast_multiplier])
-
+            positions[ii] = np.array([np.squeeze(self.slow_axis[jj]) * self.p.positions.slow_multiplier,
+                                      np.squeeze(self.fast_axis[jj]) * self.p.positions.fast_multiplier])
         log(3, 'Data loaded successfully.')
         return intensities, positions, weights
 
@@ -752,8 +708,8 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape,
             log(3, "Everything is wonderful, each diffraction point has a co-ordinate.")
 
             self._ismapped = True
-            slow_axis_bounds = [0, self.slow_axis.shape[0]]
-            fast_axis_bounds = [0, self.fast_axis.shape[-1]]
+            slow_axis_bounds = [0, self.positions_slow_shape[0]]
+            fast_axis_bounds = [0, self.positions_fast_shape[-1]]
 
             set_slow_axis_bounds = self.p.positions.bounding_box.slow_axis_bounds
             set_fast_axis_bounds = self.p.positions.bounding_box.fast_axis_bounds
@@ -777,6 +733,7 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape,
                 if self.framefilter is not None:
                     self.preview_indices = self.preview_indices[:,self.framefilter[indices[1][::skip,::skip], indices[0][::skip,::skip]].flatten()]
                 self.num_frames = len(self.preview_indices[0])
+
             else:
                 if (set_slow_axis_bounds is not None) and (set_fast_axis_bounds is not None):
                     log(3, "Setting slow axis bounds for an arbitrary mapped scan doesn't make sense. "
@@ -799,8 +756,8 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape,
             axis_data.shape (C, D) for data.shape (C*D, frame_size_m, frame_size_n) ,
             '''
             log(3, "Positions are raster, but data is a list of frames. Unpacking the data to match the positions...")
-            slow_axis_bounds = [0, self.slow_axis.shape[0]]
-            fast_axis_bounds = [0, self.fast_axis.shape[-1]]
+            slow_axis_bounds = [0, self.positions_slow_shape[0]]
+            fast_axis_bounds = [0, self.positions_fast_shape[-1]]
 
             set_slow_axis_bounds = self.p.positions.bounding_box.slow_axis_bounds
             set_fast_axis_bounds = self.p.positions.bounding_box.fast_axis_bounds
@@ -830,8 +787,8 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape,
                 axis_data.shape (C,) for data.shape (C, D, frame_size_m, frame_size_n) where D is the size of the other axis,
                 '''
                 log(3, "Assuming the axes are 1D and need to be meshed to match the raster style data")
-                slow_axis_bounds = [0, self.slow_axis.shape[0]]
-                fast_axis_bounds = [0, self.fast_axis.shape[0]]
+                slow_axis_bounds = [0, self.positions_slow_shape[0]]
+                fast_axis_bounds = [0, self.positions_fast_shape[0]]
 
                 set_slow_axis_bounds = self.p.positions.bounding_box.slow_axis_bounds
                 set_fast_axis_bounds = self.p.positions.bounding_box.fast_axis_bounds
@@ -861,8 +818,8 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape,
                 cases covered:
                 axis_data.shape (C,) for data.shape (C*D, frame_size_m, frame_size_n) where D is the size of the other axis.
                 '''
-                slow_axis_bounds = [0,self.slow_axis.shape[0]]
-                fast_axis_bounds = [0, self.fast_axis.shape[0]]
+                slow_axis_bounds = [0,self.positions_slow_shape[0]]
+                fast_axis_bounds = [0, self.positions_fast_shape[0]]
 
                 set_slow_axis_bounds = self.p.positions.bounding_box.slow_axis_bounds
                 set_fast_axis_bounds = self.p.positions.bounding_box.fast_axis_bounds
@@ -890,7 +847,7 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape,
             else:
                 raise IOError("I don't know what to do with these positions/data shapes")
         else:
-            raise IOError("I don't know what to do with these positions/data shapes")
+            raise IOError(f"I don't know what to do with these positions/data shapes: {data_shape}, {positions_slow_shape}, {positions_fast_shape}")
 
     def _finalize(self):
         """
@@ -898,10 +855,12 @@ def _finalize(self):
         """
         super()._finalize()
         for h in [self.fhandle_intensities,
-                self.fhandle_darkfield,
-                self.fhandle_flatfield,
-                self.fhandle_normalisation,
-                self.fhandle_mask]:
+                  self.fhandle_positions_fast,
+                  self.fhandle_positions_slow,
+                  self.fhandle_darkfield,
+                  self.fhandle_flatfield,
+                  self.fhandle_normalisation,
+                  self.fhandle_mask]:
             try:
                 h.close()
             except:
diff --git a/ptypy/experiment/swmr_loader.py b/ptypy/experiment/swmr_loader.py
new file mode 100644
index 000000000..e82417ecc
--- /dev/null
+++ b/ptypy/experiment/swmr_loader.py
@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+"""\
+Scan loading recipe for the Diamond beamlines.
+
+This file is part of the PTYPY package.
+
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: see LICENSE for details.
+"""
+import h5py as h5
+
+from ptypy.experiment import register
+from ptypy.experiment.hdf5_loader import Hdf5Loader
+from ptypy.utils.verbose import log
+
+try:
+    from swmr_tools import KeyFollower
+except ImportError:
+    log(3, "The SWMR loader requires swmr_tools to be installed,"
+           " try pip install swmr_tools")
+    # Bare raise keeps the original exception (missing module name and
+    # traceback) instead of replacing it with a message-less ImportError.
+    raise
+
+
+@register()
+class SwmrLoader(Hdf5Loader):
+    """
+    This is an attempt to load data from a live SWMR file that is still being written to.
+
+    Defaults:
+
+    [name]
+    default = 'SwmrLoader'
+    type = str
+    help =
+
+    [intensities.live_key]
+    default = None
+    type = str
+    help = Key to live keys inside the intensities file
+    doc = Live_keys indicate where the data collection has progressed to.
+          They are zero at the scan start, but non-zero when the position
+          is complete.
+
+    [positions.live_fast_key]
+    default = None
+    type = str
+    help = Key to live key for fast axis inside the positions file
+    doc = Live_keys indicate where the data collection has progressed to.
+          They are zero at the scan start, but non-zero when the position
+          is complete.
+
+    [positions.live_slow_key]
+    default = None
+    type = str
+    help = Key to live key for slow axis inside the positions file
+    doc = Live_keys indicate where the data collection has progressed to.
+          They are zero at the scan start, but non-zero when the position
+          is complete.
+
+    """
+    def __init__(self, *args, **kwargs):
+        """
+        Create the loader with SWMR mode switched on.
+
+        All positional and keyword arguments are forwarded unchanged to
+        the parent constructor, together with ``swmr=True``.
+        """
+        super().__init__(*args, swmr=True, **kwargs)
+
+    def _params_check(self):
+        """
+        Run the parent parameter check, then validate the SWMR specifics.
+
+        Raises
+        ------
+        RuntimeError
+            If any of the three live keys is None, or if intensities and
+            positions are not configured to come from the same file.
+        """
+        super()._params_check()
+
+        # Check if we have been given the live keys
+        if None in [self.p.intensities.live_key,
+                    self.p.positions.live_slow_key,
+                    self.p.positions.live_fast_key]:
+            raise RuntimeError("Missing live keys to intensities or positions")
+
+        # Check that intensities and positions (and their live keys)
+        # are loaded from the same file
+        if self.p.intensities.file != self.p.positions.file:
+            raise RuntimeError("Intensities and positions file should be same")
+
+    def _prepare_intensity_and_positions(self):
+        """
+        Prepare the data via the parent class and attach a key follower.
+
+        The ``KeyFollower`` is built from the live-key entries looked up in
+        the intensities, slow-axis and fast-axis file handles; it is stored
+        as ``self.kf``.
+        """
+        super()._prepare_intensity_and_positions()
+        self.kf = KeyFollower((self.fhandle_intensities[self.p.intensities.live_key],
+                               self.fhandle_positions_slow[self.p.positions.live_slow_key],
+                               self.fhandle_positions_fast[self.p.positions.live_fast_key]),
+                               timeout=5)
+
+    def compute_scan_mapping_and_trajectory(self, *args):
+        """
+        Delegate to the parent class, then verify the result is usable here.
+
+        Raises
+        ------
+        AssertionError
+            If ``self.slow_axis`` is not an ``h5py.Dataset`` afterwards.
+        """
+        super().compute_scan_mapping_and_trajectory(*args)
+        # get_data_chunk() calls .refresh() on the axes, so they must still be
+        # h5py datasets. Raise explicitly (rather than ``assert``) so the check
+        # also runs under ``python -O``; the exception type is unchanged.
+        if not isinstance(self.slow_axis, h5.Dataset):
+            raise AssertionError(
+                "Scantype = {} and mapped={} is not compatible with the "
+                "SwmrLoader".format(self._scantype, self._ismapped))
+
+    def get_data_chunk(self, *args, **kwargs):
+        """
+        Refresh the live datasets, then return the parent's data chunk.
+        """
+        # refreshing here to update before Ptyscan.get_data_chunk calls check and load
+        self.kf.refresh()
+        self.intensities.refresh()
+        self.slow_axis.refresh()
+        self.fast_axis.refresh()
+        return super().get_data_chunk(*args, **kwargs)
+
+    def check(self, frames=None, start=None):
+        """
+        Check the live SWMR file for available frames.
+
+        Parameters
+        ----------
+        frames : int, optional
+            Number of frames requested; defaults to ``self.min_frames``.
+        start : int, optional
+            Index of the first frame of the request; defaults to
+            ``self.framestart``.
+
+        Returns
+        -------
+        frames_accessible : int
+            ``frames`` if more than ``frames`` new frames are available,
+            the remaining count at the end of the scan, otherwise 0.
+        end_of_scan : int
+            1 if all ``self.num_frames`` frames are available and at most
+            ``frames`` of them are still to be loaded, otherwise 0.
+        """
+        if start is None:
+            start = self.framestart
+
+        if frames is None:
+            frames = self.min_frames
+
+        available = min(self.kf.get_current_max() + 1, self.num_frames)
+        new_frames = available - start
+        # no more than the requested nr. of frames are new
+        if new_frames <= frames:
+            # but it's the last chunk of the scan, so load it anyway
+            if available == self.num_frames:
+                frames_accessible = new_frames
+                end_of_scan = 1
+            # otherwise, do nothing
+            else:
+                end_of_scan = 0
+                frames_accessible = 0
+        # more than the requested nr. of frames are new
+        else:
+            frames_accessible = frames
+            end_of_scan = 0
+
+        return frames_accessible, end_of_scan

From 20d02ea9baea5d2699fcf3fb387fd7c9202e6e36 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Wed, 1 Mar 2023 13:54:54 +0000
Subject: [PATCH 07/37] interactive plotting: move Ipython dependency into
 jupyter client (#482)

* move Ipython dependency into jupyter client
* added explanation to local ipython import
---
 ptypy/core/ptycho.py       |  3 +--
 ptypy/utils/plot_client.py | 10 ++++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/ptypy/core/ptycho.py b/ptypy/core/ptycho.py
index eab95b33f..f4034e23e 100644
--- a/ptypy/core/ptycho.py
+++ b/ptypy/core/ptycho.py
@@ -722,11 +722,10 @@ def run(self, label=None, epars=None, engine=None):
                     if (self.p.io.autoplot.active) and (not self.p.io.autoplot.threaded):
                         if not (info["iteration"] % self.p.io.autoplot.interval):
                             if self._jupyter_client is None:
-                                from IPython import display
                                 from ptypy.utils.plot_client import _JupyterClient
                                 self._jupyter_client = _JupyterClient(self, autoplot_pars=self.p.io.autoplot, layout_pars=self.p.io.autoplot.layout)
                             self._jupyter_client.runtime.update(self.runtime)
-                            display.display(self._jupyter_client.plot(title=imsg), clear=True)
+                            self._jupyter_client.display(imsg)
                     else:
                         ilog_streamer(imsg)
                     
diff --git a/ptypy/utils/plot_client.py b/ptypy/utils/plot_client.py
index 6f0eb1bc3..e458f6531 100644
--- a/ptypy/utils/plot_client.py
+++ b/ptypy/utils/plot_client.py
@@ -716,6 +716,12 @@ def __init__(self, ptycho, autoplot_pars=None, layout_pars=None):
                                       in_thread=False)
         self.initialized = False
 
+        # Not ideal, but currently the best solution: avoid a
+        # module-level import of IPython, since it is not part
+        # of the core dependencies.
+        import IPython
+        self.ipython = IPython
+
     def plot(self, title=""):
         if not self.initialized:
             self.update_plot_layout()
@@ -725,6 +731,10 @@ def plot(self, title=""):
         plt.close(self.plot_fig)
         return self.plot_fig
 
+    def display(self,title):
+        self.ipython.display.display(self.plot(title=title), clear=True)
+        
+
 def figure_from_ptycho(P, pars=None):
     """
     Returns a matplotlib figure displaying a reconstruction

From 6ecaaa3bd3ed0325012b717ac3b4909fd234d216 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Thu, 22 Jun 2023 16:29:43 +0100
Subject: [PATCH 08/37] Python 3.11 compatibility (#489)

* add python 3.11 and remove MPI from matrix

* replace deprecated np.bool

* more replacements for deprecated np.bool and np.float

* more replacements for deprecated np.int

* drop dependency on specific Python version

* unit test cases should return None

* use raw docstring to avoid invalid escape sequence
---
 .github/workflows/test.yml                            | 11 +++--------
 cufft/dependencies.yml                                |  2 +-
 dependencies_core.yml                                 |  2 +-
 dependencies_dev.yml                                  |  2 +-
 dependencies_full.yml                                 |  2 +-
 ptypy/accelerate/base/kernels.py                      |  2 +-
 ptypy/accelerate/cuda_cupy/dependencies.yml           |  2 +-
 ptypy/accelerate/cuda_pycuda/dependencies.yml         |  2 +-
 ptypy/accelerate/ocl_pyopencl/npy_kernels.py          |  2 +-
 .../accelerate/ocl_pyopencl/npy_kernels_for_block.py  |  2 +-
 ptypy/accelerate/ocl_pyopencl/ocl_fft.py              |  2 +-
 ...ocl_kernels_self_contained_for_future_reference.py |  2 +-
 ptypy/core/sample.py                                  |  2 +-
 ptypy/core/xy.py                                      |  2 +-
 ptypy/custom/ePIE_parallel.py                         |  2 +-
 ptypy/engines/Bragg3d_engines.py                      |  2 +-
 ptypy/experiment/cSAXS.py                             |  2 +-
 ptypy/experiment/optiklabor.py                        |  2 +-
 ptypy/simulations/detector.py                         |  8 ++++----
 ptypy/utils/misc.py                                   |  2 +-
 ptypy/utils/scripts.py                                |  4 ++--
 test/template_tests/prep_and_run_moonflower_test.py   |  5 -----
 22 files changed, 27 insertions(+), 37 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index b95689973..4ec05b6b6 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -24,16 +24,11 @@ jobs:
       max-parallel: 10
       fail-fast: false
       matrix:
-        python-version: ['3.7','3.8','3.9','3.10']
-        mpi: ['mpich', 'openmpi']
-    name: Testing with ${{ matrix.mpi }} and Python ${{ matrix.python-version }} 
+        python-version: ['3.8','3.9','3.10', '3.11']
+    name: Testing with Python ${{ matrix.python-version }} 
     steps:
     - name: Checkout
       uses: actions/checkout@v3
-    - name: Set up MPI
-      uses: mpi4py/setup-mpi@v1
-      with:
-        mpi: ${{ matrix.mpi }}
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v4
       with:
@@ -47,7 +42,7 @@ jobs:
     - name: Install dependencies
       run: |
         # replace python version in core dependencies
-        sed -i 's/python=3.9/python=${{ matrix.python-version }}/' dependencies_core.yml
+        sed -i 's/python/python=${{ matrix.python-version }}/' dependencies_core.yml
         conda env update --file dependencies_core.yml --name base
         conda list	
     - name: Prepare ptypy
diff --git a/cufft/dependencies.yml b/cufft/dependencies.yml
index 949079d36..48f17a1e7 100644
--- a/cufft/dependencies.yml
+++ b/cufft/dependencies.yml
@@ -2,7 +2,7 @@ name: ptypy_cufft
 channels:
   - conda-forge
 dependencies:
-  - python=3.9
+  - python
   - cmake>=3.8.0
   - pybind11
   - compilers
diff --git a/dependencies_core.yml b/dependencies_core.yml
index c31949627..5f0b7c13f 100644
--- a/dependencies_core.yml
+++ b/dependencies_core.yml
@@ -1,6 +1,6 @@
 name: ptypy_core
 dependencies:
-  - python=3.9
+  - python
   - numpy
   - scipy
   - h5py
diff --git a/dependencies_dev.yml b/dependencies_dev.yml
index 230a7e190..5462c145a 100644
--- a/dependencies_dev.yml
+++ b/dependencies_dev.yml
@@ -2,7 +2,7 @@ name: ptypy_full
 channels:
   - conda-forge
 dependencies:
-  - python=3.9
+  - python
   - numpy
   - scipy
   - matplotlib
diff --git a/dependencies_full.yml b/dependencies_full.yml
index 65a1774f6..e43241fce 100644
--- a/dependencies_full.yml
+++ b/dependencies_full.yml
@@ -2,7 +2,7 @@ name: ptypy_full
 channels:
   - conda-forge
 dependencies:
-  - python=3.9
+  - python
   - numpy
   - scipy
   - matplotlib
diff --git a/ptypy/accelerate/base/kernels.py b/ptypy/accelerate/base/kernels.py
index f3a13bad5..af1b65b11 100644
--- a/ptypy/accelerate/base/kernels.py
+++ b/ptypy/accelerate/base/kernels.py
@@ -109,7 +109,7 @@ def error_reduce(self, addr, err_sum):
         ## Actual math ##
 
         # Reduces the Fourier error along the last 2 dimensions.fd
-        #err_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(np.float)
+        #err_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(float)
         err_sum[:] = ferr.sum(-1).sum(-1)
         return
 
diff --git a/ptypy/accelerate/cuda_cupy/dependencies.yml b/ptypy/accelerate/cuda_cupy/dependencies.yml
index cb7d31fce..6331bbbc5 100644
--- a/ptypy/accelerate/cuda_cupy/dependencies.yml
+++ b/ptypy/accelerate/cuda_cupy/dependencies.yml
@@ -2,7 +2,7 @@ name: ptypy_cupy
 channels:
   - conda-forge
 dependencies:
-  - python=3.9
+  - python
   - numpy
   - scipy
   - matplotlib
diff --git a/ptypy/accelerate/cuda_pycuda/dependencies.yml b/ptypy/accelerate/cuda_pycuda/dependencies.yml
index d8b9dfad9..455d60479 100644
--- a/ptypy/accelerate/cuda_pycuda/dependencies.yml
+++ b/ptypy/accelerate/cuda_pycuda/dependencies.yml
@@ -2,7 +2,7 @@ name: ptypy_pycuda
 channels:
   - conda-forge
 dependencies:
-  - python=3.9
+  - python
   - numpy
   - scipy
   - matplotlib
diff --git a/ptypy/accelerate/ocl_pyopencl/npy_kernels.py b/ptypy/accelerate/ocl_pyopencl/npy_kernels.py
index 3c87978ae..8f09e94d0 100644
--- a/ptypy/accelerate/ocl_pyopencl/npy_kernels.py
+++ b/ptypy/accelerate/ocl_pyopencl/npy_kernels.py
@@ -102,7 +102,7 @@ def error_reduce(self, g_err_sum, offset=0):
         ## Actual math ##
 
         # Reduceses the Fourier error along the last 2 dimensions.fd
-        error_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(np.float)
+        error_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(float)
 
     def fmag_all_update(self, pbound, g_mag, g_mask, g_err_sum, offset=0):
 
diff --git a/ptypy/accelerate/ocl_pyopencl/npy_kernels_for_block.py b/ptypy/accelerate/ocl_pyopencl/npy_kernels_for_block.py
index 85c01d4be..b8a284492 100644
--- a/ptypy/accelerate/ocl_pyopencl/npy_kernels_for_block.py
+++ b/ptypy/accelerate/ocl_pyopencl/npy_kernels_for_block.py
@@ -87,7 +87,7 @@ def error_reduce(self, addr, err_sum):
         ## Actual math ##
 
         # Reduceses the Fourier error along the last 2 dimensions.fd
-        #err_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(np.float)
+        #err_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(float)
         err_sum[:] = ferr.sum(-1).sum(-1)
         return
 
diff --git a/ptypy/accelerate/ocl_pyopencl/ocl_fft.py b/ptypy/accelerate/ocl_pyopencl/ocl_fft.py
index 26e08c298..4cfa33905 100644
--- a/ptypy/accelerate/ocl_pyopencl/ocl_fft.py
+++ b/ptypy/accelerate/ocl_pyopencl/ocl_fft.py
@@ -174,7 +174,7 @@ def __init__(self, queue, array,
 
         # attach scaling
         from reikna.transformations import mul_param
-        sc = mul_param(array, np.float)
+        sc = mul_param(array, float)
         ftreikna.parameter.output.connect(sc, sc.input, out=sc.output, scale=sc.param)
         iscale = np.sqrt(np.prod(array.shape[-2:])) if symmetric else 1.0
         scale = 1.0 / iscale
diff --git a/ptypy/accelerate/ocl_pyopencl/ocl_kernels_self_contained_for_future_reference.py b/ptypy/accelerate/ocl_pyopencl/ocl_kernels_self_contained_for_future_reference.py
index 9d4b5d88a..20793ea94 100644
--- a/ptypy/accelerate/ocl_pyopencl/ocl_kernels_self_contained_for_future_reference.py
+++ b/ptypy/accelerate/ocl_pyopencl/ocl_kernels_self_contained_for_future_reference.py
@@ -253,7 +253,7 @@ def ocl_fourier_error(self, f, fmag, fdev, ferr, fmask, mask_sum):
         self.queue.finish()
 
     def npy_error_reduce(self, ferr, err_fmag):
-        err_fmag[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(np.float)
+        err_fmag[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(float)
 
     def ocl_error_reduce(self, ferr, err_fmag):
         shape = (self.fshape[0], 64),
diff --git a/ptypy/core/sample.py b/ptypy/core/sample.py
index 6f5702424..bdb14b4f3 100644
--- a/ptypy/core/sample.py
+++ b/ptypy/core/sample.py
@@ -361,7 +361,7 @@ def simulate(A, pars, energy, fill=1.0, prefix="", **kwargs):
         logger.info(prefix +
                     "Simulation resource is a thickness profile")
         # Enforce floats
-        ob = obj.astype(np.float)
+        ob = obj.astype(float)
         ob -= ob.min()
         if d is not None:
             logger.info(prefix + "Rescaling to maximum thickness")
diff --git a/ptypy/core/xy.py b/ptypy/core/xy.py
index f5c2e6f8f..96d35cdaa 100644
--- a/ptypy/core/xy.py
+++ b/ptypy/core/xy.py
@@ -152,7 +152,7 @@ def _complete(extent, steps, spacing):
     elif steps is None:
         e = u.expect2(extent)
         s = u.expect2(spacing)
-        l = (e / s).astype(np.int)
+        l = (e / s).astype(int)
     elif spacing is None:
         e = u.expect2(extent)
         l = u.expect2(steps)
diff --git a/ptypy/custom/ePIE_parallel.py b/ptypy/custom/ePIE_parallel.py
index f92b1b9d0..a16e909db 100644
--- a/ptypy/custom/ePIE_parallel.py
+++ b/ptypy/custom/ePIE_parallel.py
@@ -179,7 +179,7 @@ def engine_prepare(self):
             if pod.active:
                 self.ob_nodecover[pod.ob_view] = 1
         self.nodemask = np.array(list(self.ob_nodecover.S.values())[0].data[0],
-                                 dtype=np.bool)
+                                 dtype=bool)
 
         # communicate this over MPI
         parallel.allreduceC(self.ob_nodecover)
diff --git a/ptypy/engines/Bragg3d_engines.py b/ptypy/engines/Bragg3d_engines.py
index 6e7cb3407..a12be75b6 100644
--- a/ptypy/engines/Bragg3d_engines.py
+++ b/ptypy/engines/Bragg3d_engines.py
@@ -167,7 +167,7 @@ def object_update(self):
                 r = np.sqrt((x_ - xcenter)**2 + (y_ - ycenter)**2)
                 scaling = np.min(geo.resolution)
                 r /= scaling
-                r = r.astype(np.int)
+                r = r.astype(int)
                 tbin = np.bincount(r.ravel(), arr.ravel())
                 nr = np.bincount(r.ravel())
                 s = np.arange(len(tbin)) * scaling
diff --git a/ptypy/experiment/cSAXS.py b/ptypy/experiment/cSAXS.py
index 236dba49d..59ef3689b 100644
--- a/ptypy/experiment/cSAXS.py
+++ b/ptypy/experiment/cSAXS.py
@@ -100,7 +100,7 @@ def check(self, frames, start):
     def load(self, indices):
         raw = {}
         for i in indices:
-            raw[i] = self.data_object.getframe(i).data.astype(np.float)
+            raw[i] = self.data_object.getframe(i).data.astype(float)
         return raw, {}, {}
 
 
diff --git a/ptypy/experiment/optiklabor.py b/ptypy/experiment/optiklabor.py
index 96e757067..81ed97fb6 100644
--- a/ptypy/experiment/optiklabor.py
+++ b/ptypy/experiment/optiklabor.py
@@ -134,7 +134,7 @@ def load_common(self):
         exposures =[]
         for j in range(self.nexp):
             darks,meta = u.image_read(self.info.dark_dir + '/ccd*_%02d.raw' % j)
-            dark_imgs.append(np.array(darks,dtype=np.float).mean(0))
+            dark_imgs.append(np.array(darks,dtype=float).mean(0))
             exposures.append(meta[0][self.exp_string])
 
         # save in common dict/Param
diff --git a/ptypy/simulations/detector.py b/ptypy/simulations/detector.py
index 7b2fb6254..ff3ff3ad6 100644
--- a/ptypy/simulations/detector.py
+++ b/ptypy/simulations/detector.py
@@ -109,20 +109,20 @@ def _update(self,pars=None):
 
     def _make_mask(self):
         gaps = expect2(self.gaps)
-        module = np.ones(self.shape).astype(np.bool)
+        module = np.ones(self.shape).astype(bool)
         start = module.copy()
         for i in range(self.modules[0]-1):
-            gap = np.zeros((gaps[0],module.shape[1])).astype(np.bool)
+            gap = np.zeros((gaps[0],module.shape[1])).astype(bool)
             start = np.concatenate([start,np.concatenate([gap,module],axis=0)],axis=0)
         module = start.copy()
         for i in range(self.modules[1]-1):
-            gap = np.zeros((module.shape[0],gaps[1])).astype(np.bool)
+            gap = np.zeros((module.shape[0],gaps[1])).astype(bool)
             start = np.concatenate([start,np.concatenate([gap,module],axis=1)],axis=1)
         self._mask = start
 
     def _get_mask(self,sh):
         msh =  expect2(sh[-2:])
-        mask = np.zeros(msh).astype(np.bool)
+        mask = np.zeros(msh).astype(bool)
         offset = msh//2 - expect2(self.center)
         mask = fill2D(mask,self._mask,-offset)
         return np.resize(mask,sh)
diff --git a/ptypy/utils/misc.py b/ptypy/utils/misc.py
index 5a352b329..75dd377c4 100644
--- a/ptypy/utils/misc.py
+++ b/ptypy/utils/misc.py
@@ -341,7 +341,7 @@ def clean_path(filename):
 
 
 def electron_wavelength(electron_energy):
-    """
+    r"""
     Calculate electron wavelength based on energy in keV:
 
     .. math::
diff --git a/ptypy/utils/scripts.py b/ptypy/utils/scripts.py
index c49cb269d..97e3248b3 100644
--- a/ptypy/utils/scripts.py
+++ b/ptypy/utils/scripts.py
@@ -147,7 +147,7 @@ def hdr_image(img_list, exp_list, thresholds=[3000,50000], dark_list=[],
     img_list = [img.astype(float) for img in img_list]
     dark_list = [dark.astype(float) for dark in dark_list]
     exp_list = [float(exp) for exp in exp_list]
-    mask_list = [mask.astype(np.int) for mask in mask_list]
+    mask_list = [mask.astype(int) for mask in mask_list]
 
     for img, dark, exp,mask in zip(img_list, dark_list,exp_list,mask_list):
         img[:] = abs(img - dark)
@@ -177,7 +177,7 @@ def hdr_image(img_list, exp_list, thresholds=[3000,50000], dark_list=[],
                                                  ix[j]][themask.astype(bool)]
                                              * max_exp/exp_list[ix[j]])
     else:
-        mask_sum = np.zeros_like(mask_list[0]).astype(np.int)
+        mask_sum = np.zeros_like(mask_list[0]).astype(int)
         img_hdr = np.zeros_like(img_list[0])
         for img, exp, mask in zip(img_list,exp_list,mask_list):
             img = img * max_exp/exp
diff --git a/test/template_tests/prep_and_run_moonflower_test.py b/test/template_tests/prep_and_run_moonflower_test.py
index 114ec35ee..8f2a825b2 100644
--- a/test/template_tests/prep_and_run_moonflower_test.py
+++ b/test/template_tests/prep_and_run_moonflower_test.py
@@ -33,7 +33,6 @@ def test_dm_single_probe(self):
         p.engines.engine00.name = 'DM'
         p.engines.engine00.numiter = 5
         P = Ptycho(p,level=5)
-        return P
 
     def test_dm_multiple_probes(self):
         p = u.Param()
@@ -67,7 +66,6 @@ def test_dm_multiple_probes(self):
         p.engines.engine00.numiter = 5
         p.engines.engine00.fourier_relax_factor = 0.05
         P = Ptycho(p,level=5)
-        return P
 
     def test_dm_resample(self):
         p = u.Param()
@@ -97,7 +95,6 @@ def test_dm_resample(self):
         p.engines.engine00.name = 'DM'
         p.engines.engine00.numiter = 5
         P = Ptycho(p,level=5)
-        return P
 
     def test_ml_single_probe(self):
         p = u.Param()
@@ -134,7 +131,6 @@ def test_ml_single_probe(self):
         p.engines.engine00.floating_intensities = False
         p.engines.engine00.numiter = 5
         P = Ptycho(p,level=5)
-        return P
 
     def test_ml_resample(self):
         p = u.Param()
@@ -172,7 +168,6 @@ def test_ml_resample(self):
         p.engines.engine00.floating_intensities = False
         p.engines.engine00.numiter = 5
         P = Ptycho(p,level=5)
-        return P
 
 if __name__ == '__main__':
     unittest.main()

From fd52d7c3b22489e445cdf5bc583a054b8a52c827 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Wed, 19 Jul 2023 12:07:05 +0100
Subject: [PATCH 09/37] Changes in numpy 1.25 (#492)

* check for longdouble

* convert size-1 numpy arrays to scalars

* replaced np.product with np.prod
---
 archive/cuda_extension/python/gpu_extension.pyx |  2 +-
 benchmark/mpi_allreduce_speed.py                |  4 ++--
 ptypy/accelerate/cuda_cupy/array_utils.py       | 12 ++++++------
 ptypy/accelerate/cuda_cupy/cufft.py             |  2 +-
 ptypy/accelerate/cuda_pycuda/array_utils.py     | 12 ++++++------
 ptypy/accelerate/cuda_pycuda/cufft.py           |  6 +++---
 ptypy/experiment/diamond_nexus.py               |  6 +++---
 ptypy/experiment/hdf5_loader.py                 |  6 +++---
 ptypy/utils/descriptor.py                       |  3 ++-
 9 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/archive/cuda_extension/python/gpu_extension.pyx b/archive/cuda_extension/python/gpu_extension.pyx
index aa0b36402..f0db75587 100644
--- a/archive/cuda_extension/python/gpu_extension.pyx
+++ b/archive/cuda_extension/python/gpu_extension.pyx
@@ -153,7 +153,7 @@ def abs2(input):
     cdef np.float32_t [:,:,::1] cout_3c
     cdef np.float64_t [:,::1] cout_d2c
     cdef np.float64_t [:,:,::1] cout_d3c
-    cdef int n = np.product(cin.shape)
+    cdef int n = np.prod(cin.shape)
 
     cdef np.float32_t [:, ::1] cin_f2c
     cdef np.complex64_t [:, ::1] cin_c2c
diff --git a/benchmark/mpi_allreduce_speed.py b/benchmark/mpi_allreduce_speed.py
index 5102e35af..2e562d944 100644
--- a/benchmark/mpi_allreduce_speed.py
+++ b/benchmark/mpi_allreduce_speed.py
@@ -11,7 +11,7 @@
 }
 
 def run_benchmark(shape):
-    megabytes = np.product(shape) * 8 / 1024 / 1024 * 2
+    megabytes = np.prod(shape) * 8 / 1024 / 1024 * 2
 
     data = np.zeros(shape, dtype=np.complex64)
     
@@ -39,4 +39,4 @@ def run_benchmark(shape):
     print('Final results for {} processes'.format(parallel.size))
     print(','.join(['Name', 'Duration', 'MB', 'MB/s']))
     for r in res:
-        print(','.join([str(x) for x in r]))
\ No newline at end of file
+        print(','.join([str(x) for x in r]))
diff --git a/ptypy/accelerate/cuda_cupy/array_utils.py b/ptypy/accelerate/cuda_cupy/array_utils.py
index 911c6111d..9c68d9431 100644
--- a/ptypy/accelerate/cuda_cupy/array_utils.py
+++ b/ptypy/accelerate/cuda_cupy/array_utils.py
@@ -279,15 +279,15 @@ def delxf(self, input, out, axis=-1):
             self.queue.use()
 
         if axis == input.ndim - 1:
-            flat_dim = np.int32(np.product(input.shape[0:-1]))
+            flat_dim = np.int32(np.prod(input.shape[0:-1]))
             self.delxf_last((
                 int((flat_dim +
                      self.last_axis_block[1] - 1) // self.last_axis_block[1]),
                 1, 1),
                 self.last_axis_block, (input, out, flat_dim, np.int32(input.shape[axis])))
         else:
-            lower_dim = np.int32(np.product(input.shape[(axis+1):]))
-            higher_dim = np.int32(np.product(input.shape[:axis]))
+            lower_dim = np.int32(np.prod(input.shape[(axis+1):]))
+            higher_dim = np.int32(np.prod(input.shape[:axis]))
             gx = int(
                 (lower_dim + self.mid_axis_block[0] - 1) // self.mid_axis_block[0])
             gy = 1
@@ -306,14 +306,14 @@ def delxb(self, input, out, axis=-1):
         if self.queue is not None:
             self.queue.use()
         if axis == input.ndim - 1:
-            flat_dim = np.int32(np.product(input.shape[0:-1]))
+            flat_dim = np.int32(np.prod(input.shape[0:-1]))
             self.delxb_last((
                 int((flat_dim +
                      self.last_axis_block[1] - 1) // self.last_axis_block[1]),
                 1, 1), self.last_axis_block, (input, out, flat_dim, np.int32(input.shape[axis])))
         else:
-            lower_dim = np.int32(np.product(input.shape[(axis+1):]))
-            higher_dim = np.int32(np.product(input.shape[:axis]))
+            lower_dim = np.int32(np.prod(input.shape[(axis+1):]))
+            higher_dim = np.int32(np.prod(input.shape[:axis]))
             gx = int(
                 (lower_dim + self.mid_axis_block[0] - 1) // self.mid_axis_block[0])
             gy = 1
diff --git a/ptypy/accelerate/cuda_cupy/cufft.py b/ptypy/accelerate/cuda_cupy/cufft.py
index 794efb858..707aba2f7 100644
--- a/ptypy/accelerate/cuda_cupy/cufft.py
+++ b/ptypy/accelerate/cuda_cupy/cufft.py
@@ -23,7 +23,7 @@ def __init__(self, array, queue=None,
         if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]:
             raise ValueError(
                 "CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048")
-        self.batches = int(np.product(
+        self.batches = int(np.prod(
             array.shape[0:dims-2]) if dims > 2 else 1)
         self.forward = forward
 
diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py
index 2abd02ba4..72eae996f 100644
--- a/ptypy/accelerate/cuda_pycuda/array_utils.py
+++ b/ptypy/accelerate/cuda_pycuda/array_utils.py
@@ -270,7 +270,7 @@ def delxf(self, input, out, axis=-1):
         axis = np.int32(axis)
 
         if axis == input.ndim - 1:
-            flat_dim = np.int32(np.product(input.shape[0:-1]))
+            flat_dim = np.int32(np.prod(input.shape[0:-1]))
             self.delxf_last(input, out, flat_dim, np.int32(input.shape[axis]),
                             block=self.last_axis_block,
                             grid=(
@@ -280,8 +280,8 @@ def delxf(self, input, out, axis=-1):
                 stream=self.queue
             )
         else:
-            lower_dim = np.int32(np.product(input.shape[(axis+1):]))
-            higher_dim = np.int32(np.product(input.shape[:axis]))
+            lower_dim = np.int32(np.prod(input.shape[(axis+1):]))
+            higher_dim = np.int32(np.prod(input.shape[:axis]))
             gx = int(
                 (lower_dim + self.mid_axis_block[0] - 1) // self.mid_axis_block[0])
             gy = 1
@@ -301,7 +301,7 @@ def delxb(self, input, out, axis=-1):
         axis = np.int32(axis)
 
         if axis == input.ndim - 1:
-            flat_dim = np.int32(np.product(input.shape[0:-1]))
+            flat_dim = np.int32(np.prod(input.shape[0:-1]))
             self.delxb_last(input, out, flat_dim, np.int32(input.shape[axis]),
                             block=self.last_axis_block,
                             grid=(
@@ -311,8 +311,8 @@ def delxb(self, input, out, axis=-1):
                 stream=self.queue
             )
         else:
-            lower_dim = np.int32(np.product(input.shape[(axis+1):]))
-            higher_dim = np.int32(np.product(input.shape[:axis]))
+            lower_dim = np.int32(np.prod(input.shape[(axis+1):]))
+            higher_dim = np.int32(np.prod(input.shape[:axis]))
             gx = int(
                 (lower_dim + self.mid_axis_block[0] - 1) // self.mid_axis_block[0])
             gy = 1
diff --git a/ptypy/accelerate/cuda_pycuda/cufft.py b/ptypy/accelerate/cuda_pycuda/cufft.py
index d10e82b1a..4859b36b2 100644
--- a/ptypy/accelerate/cuda_pycuda/cufft.py
+++ b/ptypy/accelerate/cuda_pycuda/cufft.py
@@ -21,7 +21,7 @@ def __init__(self, array, queue=None,
         columns = self.arr_shape[1]
         if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]:
             raise ValueError("CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048")
-        self.batches = int(np.product(array.shape[0:dims-2]) if dims > 2 else 1)
+        self.batches = int(np.prod(array.shape[0:dims-2]) if dims > 2 else 1)
         self.forward = forward
 
         self._load(array, pre_fft, post_fft, symmetric, forward)
@@ -121,11 +121,11 @@ def _load(self, array, pre_fft, post_fft, symmetric, forward):
         )
         # with cuFFT, we need to scale ifft
         if not symmetric and not forward:
-            self.scale = 1 / np.product(self.arr_shape)
+            self.scale = 1 / np.prod(self.arr_shape)
         elif forward and not symmetric:
             self.scale = 1.0
         else:
-            self.scale = 1 / np.sqrt(np.product(self.arr_shape))
+            self.scale = 1 / np.sqrt(np.prod(self.arr_shape))
 
         if pre_fft is not None:
             self.pre_fft = gpuarray.to_gpu(pre_fft)
diff --git a/ptypy/experiment/diamond_nexus.py b/ptypy/experiment/diamond_nexus.py
index 318887756..5b2f534c1 100644
--- a/ptypy/experiment/diamond_nexus.py
+++ b/ptypy/experiment/diamond_nexus.py
@@ -214,18 +214,18 @@ def __init__(self, pars=None, **kwargs):
 
 
         if None not in [INPUT_FILE, ENERGY_KEY]:
-            self.p.energy = float(h5.File(INPUT_FILE, 'r')[ENERGY_KEY][()] * self.ENERGY_MULTIPLIER)
+            self.p.energy = float(h5.File(INPUT_FILE, 'r')[ENERGY_KEY][()].item() * self.ENERGY_MULTIPLIER)
             self.meta.energy  = self.p.energy
             log(3, "loading energy={} from file".format(self.p.energy))
 
 
         if None not in [INPUT_FILE, DISTANCE_KEY]:
-            self.p.distance = h5.File(INPUT_FILE, 'r')[DISTANCE_KEY][()]
+            self.p.distance = h5.File(INPUT_FILE, 'r')[DISTANCE_KEY][()].item()
             self.meta.distance = self.p.distance
             log(3, "loading distance={} from file".format(self.p.distance))
         
         if None not in [INPUT_FILE, PIXEL_SIZE_KEY]:
-            self.p.psize = h5.File(INPUT_FILE, 'r')[PIXEL_SIZE_KEY][()]
+            self.p.psize = h5.File(INPUT_FILE, 'r')[PIXEL_SIZE_KEY][()].item()
             self.meta.psize = self.p.psize
             log(3, "loading psize={} from file".format(self.p.psize))
 
diff --git a/ptypy/experiment/hdf5_loader.py b/ptypy/experiment/hdf5_loader.py
index 80ecc2b87..c72949d70 100644
--- a/ptypy/experiment/hdf5_loader.py
+++ b/ptypy/experiment/hdf5_loader.py
@@ -539,20 +539,20 @@ def _prepare_meta_info(self):
                 if self._is_spectro_scan and self.p.outer_index is not None:
                     self.p.energy = float(f[self.p.recorded_energy.key][self.p.outer_index])
                 else:
-                    self.p.energy = float(f[self.p.recorded_energy.key][()])
+                    self.p.energy = float(f[self.p.recorded_energy.key][()].item())
             self.p.energy = self.p.energy * self.p.recorded_energy.multiplier + self.p.recorded_energy.offset
             self.meta.energy  = self.p.energy
             log(3, "loading energy={} from file".format(self.p.energy))
 
         if None not in [self.p.recorded_distance.file, self.p.recorded_distance.key]:
             with h5.File(self.p.recorded_distance.file, 'r', swmr=self._is_swmr) as f:
-                self.p.distance = float(f[self.p.recorded_distance.key][()] * self.p.recorded_distance.multiplier)
+                self.p.distance = float(f[self.p.recorded_distance.key][()].item() * self.p.recorded_distance.multiplier)
             self.meta.distance = self.p.distance
             log(3, "loading distance={} from file".format(self.p.distance))
         
         if None not in [self.p.recorded_psize.file, self.p.recorded_psize.key]:
             with h5.File(self.p.recorded_psize.file, 'r', swmr=self._is_swmr) as f:
-                self.p.psize = float(f[self.p.recorded_psize.key][()] * self.p.recorded_psize.multiplier)
+                self.p.psize = float(f[self.p.recorded_psize.key][()].item() * self.p.recorded_psize.multiplier)
             self.info.psize = self.p.psize
             log(3, "loading psize={} from file".format(self.p.psize))
 
diff --git a/ptypy/utils/descriptor.py b/ptypy/utils/descriptor.py
index 554dc65b2..714a581dd 100644
--- a/ptypy/utils/descriptor.py
+++ b/ptypy/utils/descriptor.py
@@ -853,7 +853,8 @@ def _walk(self, depth=0, pars=None, ignore_symlinks=False, ignore_wildcards=Fals
                 (type(pars).__name__ == 'tuple' and 'list' in self.type) or \
                 (type(pars).__name__ == 'list' and 'tuple' in self.type) or \
                 (type(pars).__name__ == 'int' and 'float' in self.type) or \
-                (type(pars).__name__[:5] == 'float' and 'float' in self.type):
+                (type(pars).__name__[:5] == 'float' and 'float' in self.type) or \
+                (type(pars).__name__ == 'longdouble' and 'float' in self.type):
             yield {'d': self, 'path': path, 'status': 'ok', 'info': ''}
         else:
             yield {'d': self, 'path': path, 'status': 'wrongtype', 'info': type(pars).__name__}

From cc0932baae9daca254122d1d3979a350178d005c Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Wed, 19 Jul 2023 13:22:51 +0100
Subject: [PATCH 10/37] Properly clean up accelerated ML engines to allow
 chaining (#491)

---
 ptypy/accelerate/base/engines/ML_serial.py   |  1 +
 templates/engines/moonflower_ML_ML.py        | 71 +++++++++++++++++++
 templates/engines/moonflower_ML_ML_pycuda.py | 72 ++++++++++++++++++++
 3 files changed, 144 insertions(+)
 create mode 100644 templates/engines/moonflower_ML_ML.py
 create mode 100644 templates/engines/moonflower_ML_ML_pycuda.py

diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py
index 248110326..38f63f385 100644
--- a/ptypy/accelerate/base/engines/ML_serial.py
+++ b/ptypy/accelerate/base/engines/ML_serial.py
@@ -348,6 +348,7 @@ def engine_finalize(self):
             prep = self.diff_info[d.ID]
             float_intens_coeff[label] = prep.float_intens_coeff
         self.ptycho.runtime["float_intens"] = parallel.gather_dict(float_intens_coeff)
+        super().engine_finalize()
 
 
 class BaseModelSerial(BaseModel):
diff --git a/templates/engines/moonflower_ML_ML.py b/templates/engines/moonflower_ML_ML.py
new file mode 100644
index 000000000..fed34ca6a
--- /dev/null
+++ b/templates/engines/moonflower_ML_ML.py
@@ -0,0 +1,71 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+#import ptypy
+from ptypy.core import Ptycho
+from ptypy import utils as u
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+
+# set home path
+p.io = u.Param()
+p.io.home = "/".join([tmpdir, "ptypy"])
+
+# saving intermediate results
+p.io.autosave = u.Param(active=False)
+
+# opens plotting GUI if interaction is set to active
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'ML'
+p.engines.engine00.ML_type = 'Gaussian'
+p.engines.engine00.reg_del2 = True                      # Whether to use a Gaussian prior (smoothing) regularizer
+p.engines.engine00.reg_del2_amplitude = 1.             # Amplitude of the Gaussian prior if used
+p.engines.engine00.scale_precond = True
+#p.engines.engine00.scale_probe_object = 1.
+p.engines.engine00.smooth_gradient = 20.
+p.engines.engine00.smooth_gradient_decay = 1/50.
+p.engines.engine00.floating_intensities = False
+p.engines.engine00.numiter = 300
+
+p.engines.engine01 = u.Param()
+p.engines.engine01.name = 'ML'
+p.engines.engine01.numiter = 20
+p.engines.engine01.numiter_contiguous = 5
+p.engines.engine01.reg_del2 = False
+p.engines.engine01.reg_del2_amplitude = 1. 
+p.engines.engine01.floating_intensities = False
+p.engines.engine01.probe_support = 0.5
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/moonflower_ML_ML_pycuda.py b/templates/engines/moonflower_ML_ML_pycuda.py
new file mode 100644
index 000000000..d506ec5ae
--- /dev/null
+++ b/templates/engines/moonflower_ML_ML_pycuda.py
@@ -0,0 +1,72 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cuda")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 400
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 100 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull' 
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 100
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=1)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'ML_pycuda'
+p.engines.engine00.numiter = 300
+p.engines.engine00.numiter_contiguous = 5
+p.engines.engine00.reg_del2 = True                      # Whether to use a Gaussian prior (smoothing) regularizer
+p.engines.engine00.reg_del2_amplitude = 1.             # Amplitude of the Gaussian prior if used
+p.engines.engine00.scale_precond = True
+p.engines.engine00.smooth_gradient = 20.
+p.engines.engine00.smooth_gradient_decay = 1/50.
+p.engines.engine00.floating_intensities = False
+
+p.engines.engine01 = u.Param()
+p.engines.engine01.name = 'ML_pycuda'
+p.engines.engine01.numiter = 20
+p.engines.engine01.numiter_contiguous = 5
+p.engines.engine01.reg_del2 = False
+p.engines.engine01.reg_del2_amplitude = 1.
+p.engines.engine01.floating_intensities = False
+p.engines.engine01.probe_support = 0.5
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)

From 6cad7cee7f11f842f47dd702587d0144fdcffb32 Mon Sep 17 00:00:00 2001
From: Jari <jfowkes@users.noreply.github.com>
Date: Wed, 19 Jul 2023 16:12:47 +0100
Subject: [PATCH 11/37] Add Euclidean noise model for ML (#486)

---
 ptypy/engines/ML.py                       | 178 +++++++++++++++++++++-
 templates/engines/moonflower_ML_Euclid.py |  62 ++++++++
 2 files changed, 234 insertions(+), 6 deletions(-)
 create mode 100644 templates/engines/moonflower_ML_Euclid.py

diff --git a/ptypy/engines/ML.py b/ptypy/engines/ML.py
index fe08995ce..e7492b42f 100644
--- a/ptypy/engines/ML.py
+++ b/ptypy/engines/ML.py
@@ -45,7 +45,7 @@ class ML(PositionCorrectionEngine):
     type = str
     help = Likelihood model
     choices = ['gaussian','poisson','euclid']
-    doc = One of ‘gaussian’, poisson’ or ‘euclid’. Only 'gaussian' is implemented.
+    doc = One of ‘gaussian’, ‘poisson’ or ‘euclid’.
 
     [floating_intensities]
     default = False
@@ -99,7 +99,7 @@ class ML(PositionCorrectionEngine):
     type = int
     lowlim = 0
     help = Number of iterations before probe update starts
-    
+
     """
 
     SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull, GradFull, BlockGradFull]
@@ -153,7 +153,7 @@ def __init__(self, ptycho_parent, pars=None):
     def engine_initialize(self):
         """
         Prepare for ML reconstruction.
-        """        
+        """
         super(ML, self).engine_initialize()
 
         # Object gradient and minimization direction
@@ -182,7 +182,7 @@ def _initialize_model(self):
         elif self.p.ML_type.lower() == "poisson":
             self.ML_model = PoissonModel(self)
         elif self.p.ML_type.lower() == "euclid":
-            raise NotImplementedError('Euclid norm model not yet implemented')
+            self.ML_model = EuclidModel(self)
         else:
             raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type)
 
@@ -235,7 +235,7 @@ def engine_iterate(self, num=1):
                 cn2_new_pr_grad = Cnorm2(new_pr_grad)
                 cn2_new_ob_grad = Cnorm2(new_ob_grad)
                 if cn2_new_pr_grad > 1e-5:
-                    scale_p_o = (self.p.scale_probe_object * cn2_new_ob_grad 
+                    scale_p_o = (self.p.scale_probe_object * cn2_new_ob_grad
                                  / cn2_new_pr_grad)
                 else:
                     scale_p_o = self.p.scale_probe_object
@@ -601,7 +601,7 @@ class PoissonModel(BaseModel):
 
     def __init__(self, MLengine):
         """
-        Core functions for ML computation using a Gaussian model.
+        Core functions for ML computation using a Poisson model.
         """
         BaseModel.__init__(self, MLengine)
         from scipy import special
@@ -745,6 +745,172 @@ def poly_line_coeffs(self, ob_h, pr_h):
         return B
 
 
+class EuclidModel(BaseModel):
+    """
+    Euclidean (Amplitude) noise model.
+    TODO: feed actual statistical weights instead of using a fixed variance.
+    """
+
+    def __init__(self, MLengine):
+        """
+        Core functions for ML computation using a Euclidean model.
+        """
+        BaseModel.__init__(self, MLengine)
+
+        # Euclidean model requires weights
+        # TODO: update this part of the code once actual weights are passed in the PODs
+        self.weights = self.engine.di.copy(self.engine.di.ID + '_weights')
+        # FIXME: This part needs to be updated once statistical weights are properly
+        # supported in the data preparation.
+        for name, di_view in self.di.views.items():
+            if not di_view.active:
+                continue
+            self.weights[di_view] = di_view.pod.ma_view.data # just the mask for now
+            #self.weights[di_view] = (di_view.pod.ma_view.data
+            #                         / (1. + stat_weights/di_view.data))
+
+    def __del__(self):
+        """
+        Clean up routine
+        """
+        BaseModel.__del__(self)
+        del self.engine.ptycho.containers[self.weights.ID]
+        del self.weights
+
+    def new_grad(self):
+        """
+        Compute a new gradient direction according to a Euclidean noise model.
+
+        Note: The negative log-likelihood and local errors are also computed
+        here.
+        """
+        self.ob_grad.fill(0.)
+        self.pr_grad.fill(0.)
+
+        # We need an array for MPI
+        LL = np.array([0.])
+        error_dct = {}
+
+        # Outer loop: through diffraction patterns
+        for dname, diff_view in self.di.views.items():
+            if not diff_view.active:
+                continue
+
+            # Weights and amplitudes for this view
+            w = self.weights[diff_view]
+            A = np.sqrt(diff_view.data)
+
+            Amodel = np.zeros_like(A)
+            f = {}
+
+            # First pod loop: compute total amplitude
+            for name, pod in diff_view.pods.items():
+                if not pod.active:
+                    continue
+                f[name] = pod.fw(pod.probe * pod.object)
+                Amodel += np.sqrt(u.abs2(f[name]))
+
+            # Floating intensity option
+            if self.p.floating_intensities:
+                self.float_intens_coeff[dname] = A.sum() / Amodel.sum()
+                Amodel *= self.float_intens_coeff[dname]
+
+            Amodel += 1e-6 # cf Poisson model
+            DA = (1. - A / Amodel)
+
+            # Second pod loop: gradients computation
+            LLL = np.sum((w * (Amodel - A)**2).astype(np.float64))
+            for name, pod in diff_view.pods.items():
+                if not pod.active:
+                    continue
+                xi = pod.bw(w*DA * f[name])
+                self.ob_grad[pod.ob_view] += 2. * xi * pod.probe.conj()
+                self.pr_grad[pod.pr_view] += 2. * xi * pod.object.conj()
+
+            diff_view.error = LLL
+            error_dct[dname] = np.array([0, LLL / np.prod(DA.shape), 0])
+            LL += LLL
+
+        # MPI reduction of gradients
+        self.ob_grad.allreduce()
+        self.pr_grad.allreduce()
+        parallel.allreduce(LL)
+
+        # Object regularizer
+        if self.regularizer:
+            for name, s in self.ob.storages.items():
+                self.ob_grad.storages[name].data += self.regularizer.grad(
+                    s.data)
+                LL += self.regularizer.LL
+        self.LL = LL / self.tot_measpts
+
+        return error_dct
+
+    def poly_line_coeffs(self, ob_h, pr_h):
+        """
+        Compute the coefficients of the polynomial for line minimization
+        in direction h
+        """
+
+        B = np.zeros((3,), dtype=np.longdouble)
+        Brenorm = 1. / self.LL[0]**2
+
+        # Outer loop: through diffraction patterns
+        for dname, diff_view in self.di.views.items():
+            if not diff_view.active:
+                continue
+
+            # Weights and amplitudes for this view
+            w = self.weights[diff_view]
+            A = np.sqrt(diff_view.data)
+
+            A0 = None
+            A1 = None
+            A2 = None
+
+            for name, pod in diff_view.pods.items():
+                if not pod.active:
+                    continue
+                f = pod.fw(pod.probe * pod.object)
+                a = pod.fw(pod.probe * ob_h[pod.ob_view]
+                           + pr_h[pod.pr_view] * pod.object)
+                b = pod.fw(pr_h[pod.pr_view] * ob_h[pod.ob_view])
+
+                if A0 is None:
+                    A0 = u.abs2(f).astype(np.longdouble)
+                    A1 = 2 * np.real(f * a.conj()).astype(np.longdouble)
+                    A2 = (2 * np.real(f * b.conj()).astype(np.longdouble)
+                          + u.abs2(a).astype(np.longdouble))
+                else:
+                    A0 += u.abs2(f)
+                    A1 += 2 * np.real(f * a.conj())
+                    A2 += 2 * np.real(f * b.conj()) + u.abs2(a)
+
+            if self.p.floating_intensities:
+                A0 *= self.float_intens_coeff[dname]
+                A1 *= self.float_intens_coeff[dname]
+                A2 *= self.float_intens_coeff[dname]
+
+            A0 += 1e-12 # cf Poisson model sqrt(1e-12) = 1e-6
+            DA = 1. - A/np.sqrt(A0)
+
+            B[0] += np.dot(w.flat, ((np.sqrt(A0) - A)**2).flat) * Brenorm
+            B[1] += np.dot(w.flat, (A1*DA).flat) * Brenorm
+            B[2] += (np.dot(w.flat, (A2*DA).flat) + .25*np.dot(w.flat, (A1**2 * A/A0**(3/2)).flat)) * Brenorm
+
+        parallel.allreduce(B)
+
+        # Object regularizer
+        if self.regularizer:
+            for name, s in self.ob.storages.items():
+                B += Brenorm * self.regularizer.poly_line_coeffs(
+                    ob_h.storages[name].data, s.data)
+
+        self.B = B
+
+        return B
+
+
 class Regul_del2(object):
     """\
     Squared gradient regularizer (Gaussian prior).
diff --git a/templates/engines/moonflower_ML_Euclid.py b/templates/engines/moonflower_ML_Euclid.py
new file mode 100644
index 000000000..edcb076f0
--- /dev/null
+++ b/templates/engines/moonflower_ML_Euclid.py
@@ -0,0 +1,62 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+#import ptypy
+from ptypy.core import Ptycho
+from ptypy import utils as u
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+
+# set home path
+p.io = u.Param()
+p.io.home = "/".join([tmpdir, "ptypy"])
+
+# saving intermediate results
+p.io.autosave = u.Param(active=False)
+
+# opens plotting GUI if interaction is set to active
+p.io.autoplot = u.Param(active=True)
+p.io.interaction = u.Param(active=True)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'ML'
+p.engines.engine00.ML_type = 'Euclid'
+p.engines.engine00.reg_del2 = True                      # Whether to use a Gaussian prior (smoothing) regularizer
+p.engines.engine00.reg_del2_amplitude = 1.             # Amplitude of the Gaussian prior if used
+p.engines.engine00.scale_precond = True
+#p.engines.engine00.scale_probe_object = 1.
+p.engines.engine00.smooth_gradient = 20.
+p.engines.engine00.smooth_gradient_decay = 1/50.
+p.engines.engine00.floating_intensities = False
+p.engines.engine00.numiter = 300
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)

From f6a33761d37d78c5baa63f5a76cb0161c1052195 Mon Sep 17 00:00:00 2001
From: Timothy Poon <62692924+ptim0626@users.noreply.github.com>
Date: Wed, 19 Jul 2023 18:23:49 +0100
Subject: [PATCH 12/37] Count CPU number as usable by current process but not
 whole system (#493)

---
 ptypy/experiment/hdf5_loader.py | 46 ++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/ptypy/experiment/hdf5_loader.py b/ptypy/experiment/hdf5_loader.py
index c72949d70..44344477f 100644
--- a/ptypy/experiment/hdf5_loader.py
+++ b/ptypy/experiment/hdf5_loader.py
@@ -55,14 +55,14 @@ class Hdf5Loader(PtyScan):
     [positions]
     default =
     type = Param
-    help = Parameters for the position information data. 
-    doc = Shapes for each axis that are currently covered and tested corresponding 
+    help = Parameters for the position information data.
+    doc = Shapes for each axis that are currently covered and tested corresponding
           to the intensity shapes are:
             * axis_data.shape (A, B) for data.shape (A, B, frame_size_m, frame_size_n),
             * axis_data.shape (k,) for data.shape (k, frame_size_m, frame_size_n),
             * axis_data.shape (C, D) for data.shape (C*D, frame_size_m, frame_size_n) ,
             * axis_data.shape (C,) for data.shape (C, D, frame_size_m, frame_size_n) where D is the
-              size of the other axis, and 
+              size of the other axis, and
             * axis_data.shape (C,) for data.shape (C*D, frame_size_m, frame_size_n) where D is the
               size of the other axis.
 
@@ -114,7 +114,7 @@ class Hdf5Loader(PtyScan):
     [mask]
     default =
     type = Param
-    help = Parameters for mask data. 
+    help = Parameters for mask data.
     doc = The shape of the loaded data is assumed to be (frame_size_m, frame_size_n) or the same
           shape of the full intensities data.
 
@@ -153,7 +153,7 @@ class Hdf5Loader(PtyScan):
     [darkfield]
     default =
     type = Param
-    help = Parameters for darkfield data. 
+    help = Parameters for darkfield data.
     doc = The shape is assumed to be (frame_size_m, frame_size_n) or the same
           shape of the full intensities data.
 
@@ -190,7 +190,7 @@ class Hdf5Loader(PtyScan):
     help = Sigma value applied for automatic detection of outliers in the normalisation dataset.
 
     [framefilter]
-    default = 
+    default =
     type = Param
     help = Parameters for the filtering of frames
     doc = The shape of loaded data is assumed to hvae the same dimensionality as data.shape[:-2]
@@ -198,7 +198,7 @@ class Hdf5Loader(PtyScan):
     [framefilter.file]
     default = None
     type = str
-    help = This is the path to the file containing the filter information. 
+    help = This is the path to the file containing the filter information.
 
     [framefilter.key]
     default = None
@@ -210,7 +210,7 @@ class Hdf5Loader(PtyScan):
     type = Param
     help = This parameter contains information if we are use the recorded energy rather than as a parameter.
             It should be a scalar value.
-    
+
     [recorded_energy.file]
     default = None
     type = str
@@ -236,7 +236,7 @@ class Hdf5Loader(PtyScan):
     type = Param
     help = This parameter contains information if we are use the recorded distance to the detector rather than as a parameter,
             It should be a scalar value.
-    
+
     [recorded_distance.file]
     default = None
     type = str
@@ -257,7 +257,7 @@ class Hdf5Loader(PtyScan):
     type = Param
     help = This parameter contains information if we are use the recorded psize to the detector rather than as a parameter,
             It should be a scalar value.
-    
+
     [recorded_psize.file]
     default = None
     type = str
@@ -296,8 +296,8 @@ class Hdf5Loader(PtyScan):
     type = bool
     default = False
     help = Switch for loading data from electron ptychography experiments.
-    doc = If True, the energy provided in keV will be considered as electron energy 
-          and converted to electron wavelengths.    
+    doc = If True, the energy provided in keV will be considered as electron energy
+          and converted to electron wavelengths.
     """
 
     def __init__(self, pars=None, swmr=False, **kwargs):
@@ -330,7 +330,7 @@ def __init__(self, pars=None, swmr=False, **kwargs):
         self.framefilter = None
         self._is_spectro_scan = False
         self._is_swmr = swmr
-        
+
         self.fhandle_intensities = None
         self.fhandle_positions_fast = None
         self.fhandle_positions_slow = None
@@ -549,7 +549,7 @@ def _prepare_meta_info(self):
                 self.p.distance = float(f[self.p.recorded_distance.key][()].item() * self.p.recorded_distance.multiplier)
             self.meta.distance = self.p.distance
             log(3, "loading distance={} from file".format(self.p.distance))
-        
+
         if None not in [self.p.recorded_psize.file, self.p.recorded_psize.key]:
             with h5.File(self.p.recorded_psize.file, 'r', swmr=self._is_swmr) as f:
                 self.p.psize = float(f[self.p.recorded_psize.key][()].item() * self.p.recorded_psize.multiplier)
@@ -870,7 +870,7 @@ def _finalize(self):
 class Hdf5LoaderFast(Hdf5Loader):
     def __init__(self, pars=None, **kwargs):
         super().__init__(pars=pars, **kwargs)
-        self.cpu_count_per_rank = max(os.cpu_count() // parallel.size,1)
+        self.cpu_count_per_rank = max(len(os.sched_getaffinity(0)) // parallel.size,1)
         print("Rank %d has access to %d processes" %(parallel.rank, self.cpu_count_per_rank))
         self.intensities_array = None
         self.weights_array = None
@@ -886,13 +886,13 @@ def subtract_dark(raw, dark):
         return corr
 
     @staticmethod
-    def _init_worker(intensities_raw_array, weights_raw_array, 
+    def _init_worker(intensities_raw_array, weights_raw_array,
                      intensities_handle,
                      weights_handle,
                      darkfield_handle,
                      flatfield_handle,
                      intensities_dtype, weights_dtype,
-                     array_shape, 
+                     array_shape,
                      mask_laid_out_like_data,
                      darkfield_laid_out_like_data,
                      flatfield_laid_out_like_data):
@@ -909,7 +909,7 @@ def _init_worker(intensities_raw_array, weights_raw_array,
     @staticmethod
     def _read_intensities_and_weights(slices):
         '''
-        Copy intensities/weights into memory and correct for 
+        Copy intensities/weights into memory and correct for
         darkfield/flatfield if they exist
         '''
         indexed_frame_slices, dest_slices = slices
@@ -961,7 +961,7 @@ def _setup_raw_intensity_buffer(self, dtype, sh):
             return
         self._intensities_raw_array = RawArray(np.ctypeslib.as_ctypes_type(dtype), npixels)
         self.intensities_array = np.frombuffer(self._intensities_raw_array, self.intensities_dtype, -1).reshape(sh)
-        
+
     def _setup_raw_weights_buffer(self, dtype, sh):
         npixels = int(np.prod(sh))
         if (self.weights_array is not None) and (self.weights_array.size == npixels):
@@ -979,13 +979,13 @@ def load_multiprocessing(self, src_slices):
         self._setup_raw_weights_buffer(self.mask_dtype, sh)
         dest_slices = [np.s_[i:i+1] for i in range(len(src_slices))]
 
-        with Pool(self.cpu_count_per_rank, 
+        with Pool(self.cpu_count_per_rank,
                   initializer=Hdf5LoaderFast._init_worker,
                   initargs=(self._intensities_raw_array, self._weights_raw_array,
                             self.intensities, self.mask, self.darkfield, self.flatfield,
                             self.intensities_dtype, self.mask_dtype,
                             sh, self.mask_laid_out_like_data,
-                            self.darkfield_laid_out_like_data, 
+                            self.darkfield_laid_out_like_data,
                             self.flatfield_field_laid_out_like_data)) as p:
             p.map(self._read_intensities_and_weights, zip(src_slices, dest_slices))
 
@@ -1013,7 +1013,7 @@ def load_unmapped_raster_scan(self, indices):
                                       self.fast_axis[slow_idx, fast_idx] * self.p.positions.fast_multiplier])
         log(3, 'Data loaded successfully.')
         return intensities, positions, weights
-    
+
     def load_mapped_and_raster_scan(self, indices):
 
         slices = []
@@ -1024,7 +1024,7 @@ def load_mapped_and_raster_scan(self, indices):
             if self._is_spectro_scan and self.p.outer_index is not None:
                 indexed_frame_slices = (self.p.outer_index,) + indexed_frame_slices
             slices.append(indexed_frame_slices)
-        
+
         self.load_multiprocessing(slices)
 
         intensities = {}

From 771cfb674d00a8ba6d45af707493cf7cd3aa7c94 Mon Sep 17 00:00:00 2001
From: Benedikt Daurer <benedikt.daurer@diamond.ac.uk>
Date: Thu, 10 Aug 2023 14:28:04 +0100
Subject: [PATCH 13/37] fix indentation for benchmarks

---
 ptypy/core/ptycho.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ptypy/core/ptycho.py b/ptypy/core/ptycho.py
index f4034e23e..9686823a3 100644
--- a/ptypy/core/ptycho.py
+++ b/ptypy/core/ptycho.py
@@ -532,7 +532,7 @@ def init_data(self, print_stats=True):
         with LogTime(self.p.io.benchmark == 'all') as t:
             while not self.new_data:
                 self.new_data = self.model.new_data()
-            if (self.p.io.benchmark == 'all') and parallel.master: self.benchmark.data_load += t.duration
+        if (self.p.io.benchmark == 'all') and parallel.master: self.benchmark.data_load += t.duration
 
         # Print stats
         parallel.barrier()

From 94f3b83b80c18b6caabbd2560d58861621e55831 Mon Sep 17 00:00:00 2001
From: Benedikt Daurer <benedikt.daurer@diamond.ac.uk>
Date: Thu, 10 Aug 2023 14:31:35 +0100
Subject: [PATCH 14/37] dump numbers for benchmarks

---
 benchmark/diamond_benchmarks/moonflower_scripts/i08.py   | 1 +
 benchmark/diamond_benchmarks/moonflower_scripts/i13.py   | 1 +
 benchmark/diamond_benchmarks/moonflower_scripts/i14_1.py | 1 +
 benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py | 1 +
 4 files changed, 4 insertions(+)

diff --git a/benchmark/diamond_benchmarks/moonflower_scripts/i08.py b/benchmark/diamond_benchmarks/moonflower_scripts/i08.py
index 273a8ecbf..193c5693e 100644
--- a/benchmark/diamond_benchmarks/moonflower_scripts/i08.py
+++ b/benchmark/diamond_benchmarks/moonflower_scripts/i08.py
@@ -28,6 +28,7 @@
 p.io.autoplot = u.Param(active=False)
 p.io.interaction = u.Param()
 p.io.interaction.server = u.Param(active=False)
+p.io.benchmark = "all"
 
 # max 200 frames (128x128px) of diffraction data
 p.scans = u.Param()
diff --git a/benchmark/diamond_benchmarks/moonflower_scripts/i13.py b/benchmark/diamond_benchmarks/moonflower_scripts/i13.py
index 1cf42d5e4..edb0cd1e4 100644
--- a/benchmark/diamond_benchmarks/moonflower_scripts/i13.py
+++ b/benchmark/diamond_benchmarks/moonflower_scripts/i13.py
@@ -28,6 +28,7 @@
 p.io.autoplot = u.Param(active=False)
 p.io.interaction = u.Param()
 p.io.interaction.server = u.Param(active=False)
+p.io.benchmark = "all"
 
 # max 200 frames (128x128px) of diffraction data
 p.scans = u.Param()
diff --git a/benchmark/diamond_benchmarks/moonflower_scripts/i14_1.py b/benchmark/diamond_benchmarks/moonflower_scripts/i14_1.py
index 9d1abcccb..eaa848f4a 100644
--- a/benchmark/diamond_benchmarks/moonflower_scripts/i14_1.py
+++ b/benchmark/diamond_benchmarks/moonflower_scripts/i14_1.py
@@ -28,6 +28,7 @@
 p.io.autoplot = u.Param(active=False)
 p.io.interaction = u.Param()
 p.io.interaction.server = u.Param(active=False)
+p.io.benchmark = "all"
 
 # max 200 frames (128x128px) of diffraction data
 p.scans = u.Param()
diff --git a/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py b/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py
index 8e3c7241e..fcf483c47 100644
--- a/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py
+++ b/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py
@@ -29,6 +29,7 @@
 p.io.autoplot = u.Param(active=False)
 p.io.interaction = u.Param()
 p.io.interaction.server = u.Param(active=False)
+p.io.benchmark = "all"
 
 # max 200 frames (128x128px) of diffraction data
 p.scans = u.Param()

From 148239f1313983cb6437ff5b30bb2c3e238fa6d9 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Thu, 12 Oct 2023 15:55:30 +0100
Subject: [PATCH 15/37] Update CONTRIB.rst (#508)

Co-authored-by: Bjoern Enders <benders@lbl.gov>
---
 CONTRIB.rst | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/CONTRIB.rst b/CONTRIB.rst
index a9aada0a4..1f26aebf5 100644
--- a/CONTRIB.rst
+++ b/CONTRIB.rst
@@ -26,24 +26,25 @@ Please ensure you satisfy most of PEP8_ recommendations. We are not dogmatic abo
 Testing
 ^^^^^^^
 
-Not much testing exists at the time of writing this document, but we are aware that this is something that should change. If you want to contribute code, it would be very good practice to also submit related tests.
+All tests are in the ``/test/`` folder and our CI pipeline runs these tests for every commit (?). Please note that tests that require GPUs are disabled for the CI pipeline. Make sure to supply tests for new code or drastic changes to the existing code base. Smaller commits or bug fixes don't require an extra test.
 
 Branches
 ^^^^^^^^
 
+We are following the `Gitflow <https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow>`_ development model, where a development branch (``dev``) is merged into the master branch for every release. Individual features are developed on topic branches from the development branch and squash-merged back into it when the feature is mature.
+
 The important permanent branches are:
- - ``master``: the current cutting-edge but functional package.
- - ``stable``: the latest release, recommended for production use.
- - ``target``: target for a next release. This branch should stay up-to-date with ``master``, and contain planned updates that will break compatibility with the current version.
- - other thematic and temporary branches will appear and disappear as new ideas are tried out and merged in.
+ - ``master``: (protected) the current release plus bugfixes / hotpatches.
+ - ``dev``: (protected) current branch for all developments. Features are branched off this branch and merged back into it upon completion.
 
 
 Development cycle
 ^^^^^^^^^^^^^^^^^
 
-There has been only two releases of the code up to now, so what we can tell about the *normal development cycle* for |ptypy| is rather limited. However the plan is as follows:
- - Normal development usually happens on thematic branches. These branches are merged back to master when it is clear that (1) the feature is sufficiently debugged and tested and (2) no current functionality will break.
- - At regular interval admins will decide to freeze the development for a new stable release. During this period, development will be allowed only on feature branches but master will accept only bug fixes. Once the stable release is done, development will continue.
+|ptypy| does not follow a rigid release schedule. Releases are prepared for major events or when a set of features has reached maturity.
+
+ - Normal development usually happens on thematic branches from the ``dev`` branch. These branches are merged back to ``dev`` when it is clear that (1) the feature is sufficiently debugged and tested and (2) no current functionality will break.
+ - For a release the dev branch will be merged back into master and that merge tagged as a release.
 
 
 3. Pull requests
@@ -51,16 +52,9 @@ There has been only two releases of the code up to now, so what we can tell abou
 
 Most likely you are a member of the |ptypy| team, which give you access to the full repository, but no right to commit changes. The proper way of doing this is *pull requests*. You can read about how this is done on github's `pull requests tutorial`_.
 
-Pull requests can be made against one of the feature branches, or against ``target`` or ``master``. In the latter cases, if your changes are deemed a bit too substantial, the first thing we will do is create a feature branch for your commits, and we will let it live for a little while, making sure that it is all fine. We will then merge it onto ``master`` (or ``target``).
-
-In principle bug fixes can be requested on the ``stable`` branch. 
-
-3. Direct commits
------------------
-
-If you are one of our power-users (or power-developers), you can be given rights to commit directly to |ptypy|. This makes things much simpler of course, but with great power comes great responsibility.
+Pull requests shall be made against one of the feature branches, or against ``dev`` or ``master``. For PRs against ``master`` we will only accept bugfixes or smaller changes. Every other PR should be made against ``dev``. Your PR will be reviewed and discussed amongst the core developer team. The more you touch core libraries, the more scrutiny your PR will face. However, we created two folders in the main source folder where you have more freedom to try out things. For example, if you want to provide a new reconstruction engine, place it into the ``custom/`` folder. A new ``PtyScan`` subclass that prepares data from your experiment is best placed in the ``experiment/`` folder.
 
-To make sure that things are done cleanly, we encourage all the core developers to create thematic remote branches instead of committing always onto master. Merging these thematic branches will be done as a collective decision during one of the regular admin meetings.
+If you develop a new feature on a topic branch, it is your responsibility to keep it current with the ``dev`` branch to avoid merge conflicts.
 
 
 .. |ptypy| replace:: PtyPy
@@ -68,4 +62,4 @@ To make sure that things are done cleanly, we encourage all the core developers
 
 .. _PEP8: https://www.python.org/dev/peps/pep-0008/
 
-.. _`pull requests tutorial`: https://help.github.com/articles/using-pull-requests/
\ No newline at end of file
+.. _`pull requests tutorial`: https://help.github.com/articles/using-pull-requests/

From c544fefde6dafba46c53b64acccab543aa8ec8a2 Mon Sep 17 00:00:00 2001
From: Thomas Milburn <milburn.j.thomas@gmail.com>
Date: Thu, 12 Oct 2023 15:59:37 +0100
Subject: [PATCH 16/37] Updated docstrings which are missing choices (#507)

* Added more choices to comments

---------

Co-authored-by: Thomas Milburn <thomas.milburn@diamond.ac.uk>
---
 archive/cuda_extension/engines/DM_gpu.py | 1 +
 archive/cuda_extension/engines/DM_npy.py | 1 +
 archive/engines/DM.py                    | 1 +
 ptypy/core/data.py                       | 2 ++
 ptypy/core/illumination.py               | 1 +
 ptypy/core/manager.py                    | 4 ++++
 ptypy/core/ptycho.py                     | 4 ++++
 ptypy/core/sample.py                     | 1 +
 ptypy/engines/base.py                    | 1 +
 ptypy/engines/projectional.py            | 1 +
 10 files changed, 17 insertions(+)

diff --git a/archive/cuda_extension/engines/DM_gpu.py b/archive/cuda_extension/engines/DM_gpu.py
index 399eb143c..9e81ad7fa 100644
--- a/archive/cuda_extension/engines/DM_gpu.py
+++ b/archive/cuda_extension/engines/DM_gpu.py
@@ -57,6 +57,7 @@ class DMGpu(DMNpy):
     default = 'linear'
     type = str
     help = Subpixel interpolation; 'fourier','linear' or None for no interpolation
+    choices = ['fourier','linear',None]
 
     [update_object_first]
     default = True
diff --git a/archive/cuda_extension/engines/DM_npy.py b/archive/cuda_extension/engines/DM_npy.py
index f601a46dd..6fce4bc5d 100644
--- a/archive/cuda_extension/engines/DM_npy.py
+++ b/archive/cuda_extension/engines/DM_npy.py
@@ -55,6 +55,7 @@ class DMNpy(DM):
     default = 'linear'
     type = str
     help = Subpixel interpolation; 'fourier','linear' or None for no interpolation
+    choices = ['fourier','linear',None]
 
     [update_object_first]
     default = True
diff --git a/archive/engines/DM.py b/archive/engines/DM.py
index 50936bd42..1124158a2 100644
--- a/archive/engines/DM.py
+++ b/archive/engines/DM.py
@@ -55,6 +55,7 @@ class DM(PositionCorrectionEngine):
     default = 'linear'
     type = str
     help = Subpixel interpolation; 'fourier','linear' or None for no interpolation
+    choices = ['fourier','linear',None]
 
     [update_object_first]
     default = True
diff --git a/ptypy/core/data.py b/ptypy/core/data.py
index 636857bca..288def4bf 100644
--- a/ptypy/core/data.py
+++ b/ptypy/core/data.py
@@ -114,6 +114,7 @@ class PtyScan(object):
     default = data
     help = Determines what will be loaded in parallel
     doc = Choose from ``None``, ``'data'``, ``'common'``, ``'all'``
+    choices = ['data', 'common', 'all']
 
     [rebin]
     type = int
@@ -139,6 +140,7 @@ class PtyScan(object):
        <newline>
        Alternatively, a 3-tuple of booleans may be provided ``(do_transpose, 
        do_flipud, do_fliplr)``
+    choices = [0, 1, 2, 3, 4, 5, 6, 7]
     userlevel = 1
 
     [min_frames]
diff --git a/ptypy/core/illumination.py b/ptypy/core/illumination.py
index af5d4c06b..ddf1f98f7 100644
--- a/ptypy/core/illumination.py
+++ b/ptypy/core/illumination.py
@@ -130,6 +130,7 @@
     	 - *<template>* : one of the templates inillumination module
     	
     	In script, you may pass a numpy.ndarray here directly as the model. It is considered as incoming wavefront and will be propagated according to `propagation` with an optional `aperture` applied before.
+    choices = ['recon','stxm',None]
     userlevel = 0
 
     [photons]
diff --git a/ptypy/core/manager.py b/ptypy/core/manager.py
index 0d9d7d341..ec376c8ff 100644
--- a/ptypy/core/manager.py
+++ b/ptypy/core/manager.py
@@ -71,6 +71,7 @@ class ScanModel(object):
     default = farfield
     help = Propagation type
     doc = Either "farfield" or "nearfield"
+    choices = ['farfield', 'nearfield']
     userlevel = 1
 
     [ffttype]
@@ -78,6 +79,7 @@ class ScanModel(object):
     default = scipy
     help = FFT library
     doc = Choose from "numpy", "scipy" or "fftw"
+    choices = ['numpy', 'scipy', 'fftw']
     userlevel = 1
 
     [data]
@@ -906,6 +908,7 @@ class _Full(object):
        - ``'irregular'``: no assumption
       **[not implemented]**
     type = str
+    choices = ['achromatic', 'linear', 'irregular']
     userlevel = 2
 
     [coherence.probe_dispersion]
@@ -917,6 +920,7 @@ class _Full(object):
        - ``'irregular'``: no assumption
       **[not implemented]**
     type = str
+    choices = ['achromatic', 'linear', 'irregular']
     userlevel = 2
 
     [resolution]
diff --git a/ptypy/core/ptycho.py b/ptypy/core/ptycho.py
index 9686823a3..94bbef48a 100644
--- a/ptypy/core/ptycho.py
+++ b/ptypy/core/ptycho.py
@@ -82,6 +82,7 @@ class Ptycho(Base):
        - ``INSPECT``:  Object Information
        - ``DEBUG``:    Debug
     type = str, int
+    choices = ['CRITICAL', 'ERROR', 'WARNING', 'INFO', 'INSPECT', 'DEBUG']
     userlevel = 0
 
     [data_type]
@@ -90,6 +91,7 @@ class Ptycho(Base):
     doc = Reconstruction floating number precision (``'single'`` or
           ``'double'``)
     type = str
+    choices = ['single', 'double']
     userlevel = 1
 
     [run]
@@ -248,6 +250,7 @@ class Ptycho(Base):
     help = Options for default plotter or template name
     doc = Flexible layout for default plotter is not implemented yet. Please choose one of the
       templates ``'default'``,``'black_and_white'``,``'nearfield'``, ``'minimal'`` or ``'weak'``
+    choices = ['default', 'black_and_white', 'nearfield', 'minimal', 'weak']
     userlevel = 2
 
     [io.autoplot.dump]
@@ -269,6 +272,7 @@ class Ptycho(Base):
     help = Produce timings for benchmarking the performance of data loaders and engines
     doc = Switch to get timings and save results to a json file in p.io.home
         Choose ``'all'`` for timing data loading, engine_init, engine_prepare, engine_iterate and engine_finalize
+    choices = ['all', 'loading', 'engine_init', 'engine_prepare', 'engine_iterate', 'engine_finalize']
     userlevel = 2
 
     [scans]
diff --git a/ptypy/core/sample.py b/ptypy/core/sample.py
index bdb14b4f3..4998e65cb 100644
--- a/ptypy/core/sample.py
+++ b/ptypy/core/sample.py
@@ -36,6 +36,7 @@
       processed according to `process` in order to *simulate* a sample from e.g. a thickness
       profile.
     type = str, array
+    choices = ['recon', 'stxm', 'None']
     userlevel = 0
 
     [fill]
diff --git a/ptypy/engines/base.py b/ptypy/engines/base.py
index a7ffb6cee..67426c0ba 100644
--- a/ptypy/engines/base.py
+++ b/ptypy/engines/base.py
@@ -376,6 +376,7 @@ class PositionCorrectionEngine(BaseEngine):
     default = "fourier"
     type = str
     help = Error metric, can choose between "fourier" and "photon"
+    choices = ["fourier", "photon"]
     
     [position_refinement.record]
     default = False
diff --git a/ptypy/engines/projectional.py b/ptypy/engines/projectional.py
index ba72aa2db..440831ea4 100644
--- a/ptypy/engines/projectional.py
+++ b/ptypy/engines/projectional.py
@@ -39,6 +39,7 @@ class _ProjectionEngine(PositionCorrectionEngine):
     default = 'linear'
     type = str
     help = Subpixel interpolation; 'fourier','linear' or None for no interpolation
+    choices = ['fourier','linear',None]
 
     [update_object_first]
     default = True

From 4b88db0815f4853cafd8cd33368a00623e465326 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Fri, 27 Oct 2023 16:28:36 +0100
Subject: [PATCH 17/37] Add new kind to save all used params into .ptyr (#501)

---
 ptypy/core/ptycho.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/ptypy/core/ptycho.py b/ptypy/core/ptycho.py
index 94bbef48a..d66d68b22 100644
--- a/ptypy/core/ptycho.py
+++ b/ptypy/core/ptycho.py
@@ -157,7 +157,8 @@ class Ptycho(Base):
     doc = Choose a reconstruction file format for after engine completion.
        - ``'minimal'``: Bare minimum of information
        - ``'dls'``:    Custom format for Diamond Light Source
-    choices = 'minimal','dls'
+       - ``'used_params'``: Same as minimal but including all used parameters 
+    choices = 'minimal','dls','used_params'
 
     [io.interaction]
     default = None
@@ -991,7 +992,7 @@ def save_run(self, alt_file=None, kind='minimal', force_overwrite=True):
 
                 content = dump
 
-            elif kind == 'minimal' or kind == 'dls':
+            elif kind in ('minimal', 'dls', 'used_params'):
                 # if self.interactor is not None:
                 #    self.interactor.stop()
                 logger.info('Generating shallow copies of probe, object and '
@@ -1006,7 +1007,7 @@ def save_run(self, alt_file=None, kind='minimal', force_overwrite=True):
                     defaults_tree['ptycho'].validate(self.p) # check the parameters are actually able to be read back in
                 except RuntimeError:
                     logger.warning("The parameters we are saving won't pass a validator check!")
-                minimal.pars = self.p.copy()  # _to_dict(Recursive=True)
+                minimal.pars = self.p.copy(depth=99)  # _to_dict(Recursive=True)
                 minimal.runtime = self.runtime.copy()
 
                 content = minimal
@@ -1020,6 +1021,13 @@ def save_run(self, alt_file=None, kind='minimal', force_overwrite=True):
                 for ID, S in self.obj.storages.items():
                     content.obj[ID]['grids'] = S.grids()
 
+            if kind == 'used_params':
+                for name, engine in self.engines.items():
+                    content.pars.engines[name] = engine.p
+                for name, scan in self.model.scans.items():
+                    content.pars.scans[name] = scan.p
+                    content.pars.scans[name].data = scan.ptyscan.p
+
             if kind in ['minimal', 'dls'] and self.record_positions:
                 content.positions = {}
                 for ID, S in self.obj.storages.items():

From d6da8287fec8f749647c46ac92bcd36fc9b8d46a Mon Sep 17 00:00:00 2001
From: Maik Kahnt <kahntm@gmail.com>
Date: Fri, 27 Oct 2023 17:30:17 +0200
Subject: [PATCH 18/37] if refined positions are saved, they are also saved to
 dump files (#509)

---
 ptypy/core/ptycho.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ptypy/core/ptycho.py b/ptypy/core/ptycho.py
index d66d68b22..9789dbecc 100644
--- a/ptypy/core/ptycho.py
+++ b/ptypy/core/ptycho.py
@@ -990,6 +990,11 @@ def save_run(self, alt_file=None, kind='minimal', force_overwrite=True):
                 if len(self.runtime.iter_info) > 0:
                     dump.runtime.iter_info = [self.runtime.iter_info[-1]]
 
+                if self.record_positions:
+                    dump.positions = {}
+                    for ID, S in self.obj.storages.items():
+                        dump.positions[ID] = np.array([v.coord for v in S.views if v.pod.pr_view.layer==0])
+
                 content = dump
 
             elif kind in ('minimal', 'dls', 'used_params'):

From 2ab0bb42db962a32aa839aead5fedd7cfa75bf1a Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Wed, 1 Nov 2023 09:22:25 +0000
Subject: [PATCH 19/37] [WIP] Make it easier to do parameter sweeps (#506)

* Add function for dumping state
* added functionality to dump and restore the state of a recon
* rolled back deep copies and save/restore copies into a dict instead
---
 ptypy/core/ptycho.py                          |  41 ++-
 .../moonflower_raar_parameter_sweep.ipynb     | 264 ++++++++++++++++++
 2 files changed, 304 insertions(+), 1 deletion(-)
 create mode 100644 templates/misc/moonflower_raar_parameter_sweep.ipynb

diff --git a/ptypy/core/ptycho.py b/ptypy/core/ptycho.py
index 9789dbecc..e9b725145 100644
--- a/ptypy/core/ptycho.py
+++ b/ptypy/core/ptycho.py
@@ -356,6 +356,7 @@ def __init__(self, pars=None, level=2, **kwargs):
         self.mask = None
         self.model = None
         self.new_data = None
+        self.state_dict = dict()
 
         # Communication
         self.interactor = None
@@ -1115,7 +1116,45 @@ def plot_overview(self, fignum=100):
                            cmap='gray')
             fignum += 1
 
-    
+
+    def copy_state(self, name="baseline", overwrite=False):
+        """
+        Store a copy of the current object, probe and runtime under `name`
+
+        Warning: This feature is under development and syntax might change!
+        """
+        if name in self.state_dict:
+            logger.warning("A state with name {:s} exists already".format(name))
+            if overwrite:
+                logger.warning("Overwrite {:s} state".format(name))                
+                del self.state_dict[name]
+            else:
+                return  # keep the existing state untouched
+        self.state_dict[name] = {}
+        self.state_dict[name]["ob"] = self.obj.copy()
+        self.state_dict[name]["pr"] = self.probe.copy()
+        self.state_dict[name]["runtime"] = self.runtime.copy(depth=99)
+        logger.info("Saved a copy of object and probe as the {:s} state".format(name))
+            
+    def restore_state(self, name="baseline", reformat_exit=True):
+        """
+        Restore object, probe and runtime from a saved copy (KeyError if `name` is unknown)
+
+        Warning: This feature is under development and syntax might change!
+        """
+        if name not in self.state_dict:
+            raise KeyError("No state with name {!s} has been saved".format(name))
+        state = self.state_dict[name]
+        for ID, S in self.probe.storages.items():
+            S.data[:] = state["pr"].storages[ID].data
+        for ID, S in self.obj.storages.items():
+            S.data[:] = state["ob"].storages[ID].data
+        self.runtime = state["runtime"].copy(depth=99)  # copy, so later runs cannot alter the saved state
+        if reformat_exit:  # reformat/recalculate the exit waves for the restored object/probe
+            self.exit.reformat()
+            for scan in self.model.scans.values():
+                scan._initialize_exit(list(self.pods.values()))
+
     def _redistribute_data(self, div = 'rect', obj_storage=None):
         """
         This function redistributes data among nodes, so that each
diff --git a/templates/misc/moonflower_raar_parameter_sweep.ipynb b/templates/misc/moonflower_raar_parameter_sweep.ipynb
new file mode 100644
index 000000000..3bfdce563
--- /dev/null
+++ b/templates/misc/moonflower_raar_parameter_sweep.ipynb
@@ -0,0 +1,264 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## PtyPy moonflower example\n",
+    "#### scan model: BlockFull\n",
+    "#### engine: Relaxed Averaged Alternate Projections (RAAR)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from ptypy.core import Ptycho\n",
+    "from ptypy import utils as u"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create parameter tree\n",
+    "p = u.Param()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# set verbose level to interactive\n",
+    "p.verbose_level = \"interactive\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# set home path and io settings (no files saved)\n",
+    "p.io = u.Param()\n",
+    "p.io.rfile = None\n",
+    "p.io.autosave = u.Param(active=False)\n",
+    "p.io.interaction = u.Param(active=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p.io.autoplot = u.Param()\n",
+    "p.io.autoplot.active = True\n",
+    "p.io.autoplot.threaded = False\n",
+    "p.io.autoplot.layout = \"jupyter\"\n",
+    "p.io.autoplot.interval = 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# max 200 frames (128x128px) of diffraction data\n",
+    "p.scans = u.Param()\n",
+    "p.scans.MF = u.Param()\n",
+    "p.scans.MF.name = 'BlockFull'\n",
+    "p.scans.MF.data= u.Param()\n",
+    "p.scans.MF.data.name = 'MoonFlowerScan'\n",
+    "p.scans.MF.data.shape = 128\n",
+    "p.scans.MF.data.num_frames = 200\n",
+    "p.scans.MF.data.save = None\n",
+    "p.scans.MF.data.density = 0.2\n",
+    "p.scans.MF.data.photons = 1e8\n",
+    "p.scans.MF.data.psf = 0."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load/prep data and save initial state"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load and prepare\n",
+    "P = Ptycho(p,level=4)\n",
+    "# Save initial state\n",
+    "P.copy_state(name=\"initial\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Run RAAR with beta = 0.9 and save the final state"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# RAAR reconstruction engine\n",
+    "engine_pars = u.Param()\n",
+    "engine_pars.name = 'RAAR'\n",
+    "engine_pars.numiter = 100\n",
+    "engine_pars.beta = 0.9\n",
+    "P.run(epars=engine_pars)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "P.copy_state(name=\"RAAR with beta 0.9\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Run RAAR with beta = 0.7 and save the final state"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "P.restore_state(name=\"initial\")\n",
+    "# Check that we are indeed starting from beginning again\n",
+    "fig = u.plot_client.figure_from_ptycho(P)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# RAAR reconstruction engine\n",
+    "engine_pars = u.Param()\n",
+    "engine_pars.name = 'RAAR'\n",
+    "engine_pars.numiter = 100\n",
+    "engine_pars.beta = 0.7\n",
+    "P.run(epars=engine_pars)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "P.copy_state(name=\"RAAR with beta 0.7\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Compare reconstructions with different beta"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "fig, axes = plt.subplots(ncols=2, nrows=3, figsize=(4,6))\n",
+    "axes[0,0].set_title(\"beta = 0.9\")\n",
+    "axes[0,0].imshow(np.abs(P.state_dict[\"RAAR with beta 0.9\"][\"ob\"].storages[\"SMFG00\"].data[0,100:-100,100:-100]))\n",
+    "axes[1,0].imshow(np.angle(P.state_dict[\"RAAR with beta 0.9\"][\"ob\"].storages[\"SMFG00\"].data[0,100:-100,100:-100]))\n",
+    "ax10 = u.PtyAxis(axes[2,0], channel=\"c\")\n",
+    "ax10.set_data(P.state_dict[\"RAAR with beta 0.9\"][\"pr\"].storages[\"SMFG00\"].data[0,20:-20,20:-20])\n",
+    "axes[0,1].set_title(\"beta = 0.7\")\n",
+    "axes[0,1].imshow(np.abs(P.state_dict[\"RAAR with beta 0.7\"][\"ob\"].storages[\"SMFG00\"].data[0,100:-100,100:-100]))\n",
+    "axes[1,1].imshow(np.angle(P.state_dict[\"RAAR with beta 0.7\"][\"ob\"].storages[\"SMFG00\"].data[0,100:-100,100:-100]))\n",
+    "ax11 = u.PtyAxis(axes[2,1], channel=\"c\")\n",
+    "ax11.set_data(P.state_dict[\"RAAR with beta 0.7\"][\"pr\"].storages[\"SMFG00\"].data[0,20:-20,20:-20])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "iters_0_7 = np.array([it[\"iterations\"] for it in P.state_dict[\"RAAR with beta 0.7\"][\"runtime\"][\"iter_info\"]])\n",
+    "error_0_7 = np.array([it[\"error\"] for it in P.state_dict[\"RAAR with beta 0.7\"][\"runtime\"][\"iter_info\"]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "iters_0_9 = np.array([it[\"iterations\"] for it in P.state_dict[\"RAAR with beta 0.9\"][\"runtime\"][\"iter_info\"]])\n",
+    "error_0_9 = np.array([it[\"error\"] for it in P.state_dict[\"RAAR with beta 0.9\"][\"runtime\"][\"iter_info\"]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.figure()\n",
+    "plt.plot(iters_0_7, error_0_7[:,1], label=\"beta=0.7\")\n",
+    "plt.plot(iters_0_9, error_0_9[:,1], label=\"beta=0.9\")\n",
+    "plt.semilogy()\n",
+    "plt.ylabel(\"Maximum Likelihood error\")\n",
+    "plt.xlabel(\"Nr. of iterations\")\n",
+    "plt.legend()\n",
+    "plt.show()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel) [User Conda - cuda11.7]",
+   "language": "python",
+   "name": "conda-env-User_Conda_-_cuda11.7-python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

From 7f41f07ac5f85eab4af467bf4ba008fd5c6d2c72 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Fri, 15 Dec 2023 14:58:03 +0000
Subject: [PATCH 20/37] increase range for rebinning (#514)

---
 ptypy/core/data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ptypy/core/data.py b/ptypy/core/data.py
index 288def4bf..3b55495e0 100644
--- a/ptypy/core/data.py
+++ b/ptypy/core/data.py
@@ -123,7 +123,7 @@ class PtyScan(object):
     doc = Rebinning factor for the raw data frames. ``'None'`` or ``1`` both mean *no binning*
     userlevel = 1
     lowlim = 1
-    uplim = 8
+    uplim = 32
 
     [orientation]
     type = int, tuple, list
@@ -799,7 +799,7 @@ def get_data_chunk(self, chunksize, start=None):
             rebin = self.rebin
             if rebin <= 1:
                 pass
-            elif (rebin in range(2, 6)
+            elif (rebin in range(2, 32+1)
                   and (((sh / float(rebin)) % 1) == 0.0).all()):
                 mask = w > 0
                 d = u.rebin_2d(d, rebin)

From 1ade54b85b1c4d6b40d6e3142b575faf6d17faf9 Mon Sep 17 00:00:00 2001
From: Timothy Poon <62692924+ptim0626@users.noreply.github.com>
Date: Fri, 15 Dec 2023 14:59:58 +0000
Subject: [PATCH 21/37] Remove legacy scipy.fftpack (#516)

---
 ptypy/core/geometry.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ptypy/core/geometry.py b/ptypy/core/geometry.py
index a3f1a9641..ab7852216 100644
--- a/ptypy/core/geometry.py
+++ b/ptypy/core/geometry.py
@@ -7,7 +7,7 @@
     :license: see LICENSE for details.
 """
 import numpy as np
-from scipy import fftpack
+import scipy.fft
 
 from .. import utils as u
 from ..utils.verbose import logger
@@ -55,7 +55,7 @@ class Geo(Base):
         If set to True, changes to properties like :py:meth:`energy`,
         :py:meth:`lam`, :py:meth:`shape` or :py:meth:`psize` will cause
         a call to :py:meth:`update`.
-    
+
 
     Default geometry parameters. See also :py:data:`.scan.geometry`
 
@@ -471,8 +471,8 @@ def _FFTW_fft(self):
         self.ifft = lambda x: fftw_np.ifft2(x, planner_effort=pe)
 
     def _scipy_fft(self):
-        self.fft = lambda x: fftpack.fft2(x).astype(x.dtype)
-        self.ifft = lambda x: fftpack.ifft2(x).astype(x.dtype)
+        self.fft = lambda x: scipy.fft.fft2(x).astype(x.dtype)
+        self.ifft = lambda x: scipy.fft.ifft2(x).astype(x.dtype)
 
     def _numpy_fft(self):
         self.fft = lambda x: np.ascontiguousarray(np.fft.fft2(x).astype(x.dtype))

From ae7cff629cede43daba5b43b75118be358fb5dbd Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Fri, 15 Dec 2023 15:57:41 +0000
Subject: [PATCH 22/37] Starting point for implementation of multislice ePIE
 (#500)

* initial implementation of multislice ePIE

* code runs but probably still bug for slices > 1

* save slices information and fix the update loop

* object as product of all slices at each iteration

* iterating over pods to allow for modes

* swapped loops

* added the 3PIE article to the engine

* renamed the engine to match the algorithm name in the literature

* renamed the file to match the engine and algorithm name

* python convention for class names

* file name as class name

* added semi-functioning switching on of slices at arbitrary iterations

* allow non equal slice spacing

* changed filenames

---------

Co-authored-by: Yiran Lu <19369063+yiranlus@users.noreply.github.com>
Co-authored-by: kahntm <kahntm@gmail.com>
---
 ptypy/custom/threepie.py              | 230 ++++++++++++++++++++++++++
 templates/misc/moonflower_ThreePIE.py |  65 ++++++++
 2 files changed, 295 insertions(+)
 create mode 100644 ptypy/custom/threepie.py
 create mode 100644 templates/misc/moonflower_ThreePIE.py

diff --git a/ptypy/custom/threepie.py b/ptypy/custom/threepie.py
new file mode 100644
index 000000000..19c572ba9
--- /dev/null
+++ b/ptypy/custom/threepie.py
@@ -0,0 +1,230 @@
+# -*- coding: utf-8 -*-
+"""
+A simple implementation of Multislice for the
+ePIE algorithm.
+
+authors: Benedikt J. Daurer and more...
+"""
+from ptypy.engines import stochastic
+from ptypy.engines import register
+from ptypy.core import geometry
+from ptypy.utils import Param
+from ptypy.utils.verbose import logger
+from ptypy import io
+import numpy as np
+
+@register()
+class ThreePIE(stochastic.EPIE):
+    """
+    An extension of EPIE to include multislice
+
+    Defaults:
+
+    [name]
+    default = ThreePIE
+    type = str
+    help =
+    doc =
+
+    [number_of_slices]
+    default = 2
+    type = int
+    help = The number of slices
+    doc = Defines how many slices are used for the multi-slice object.
+
+    [slice_thickness]
+    default = 1e-6
+    type = float, list, tuple
+    help = Thickness of a single slice in meters
+    doc = A single float value or a list of float values. If a single value is used, all the slice will be assumed to be of the same thickness.
+
+    [slice_start_iteration]
+    default = 0
+    type = int, list, tuple
+    help = iteration number to start using a specific slice
+    doc =
+
+    [fslices]
+    default = slices.h5
+    type = str
+    help = File path for the slice data
+    doc =
+
+    """
+    def __init__(self, ptycho_parent, pars=None):
+        """Initialise the EPIE base engine and register the 3PIE article as citation."""
+        super(ThreePIE, self).__init__(ptycho_parent, pars)
+        self.article = dict(
+            # plain title text; the former leading '{' was never closed
+            title='Ptychographic transmission microscopy in three dimensions using a multi-slice approach',
+            author='A. M. Maiden et al.',
+            journal='J. Opt. Soc. Am. A',
+            volume=29, year=2012, page=1606,
+            doi='10.1364/JOSAA.29.001606',
+            comment='The 3PIE reconstruction algorithm',
+        )
+        self.ptycho.citations.add_article(**self.article)
+
+    def engine_initialize(self):
+        """Allocate per-slice containers and build the slice-to-slice propagators."""
+        super().engine_initialize()
+        nslices = self.p.number_of_slices
+        # One object, incident probe and exit wave container per slice
+        self._object = [None] * nslices
+        self._probe = [None] * nslices
+        self._exits = [None] * nslices
+        for i in range(nslices):
+            self._object[i] = self.ob.copy(self.ob.ID + "_o_" + str(i))
+            self._probe[i] = self.pr.copy(self.pr.ID + "_p_" + str(i))
+            self._exits[i] = self.pr.copy(self.pr.ID + "_e_" + str(i))
+
+        # ToDo: switching on a slice at a freely chosen iteration works,
+        #       but not if the most downstream slice is switched off
+        start = self.p.slice_start_iteration
+        if isinstance(start, int):
+            self.p.slice_start_iteration = np.ones(nslices) * start
+        elif len(start) != nslices:
+            logger.warning('dimension of given slice_start_iteration (%d) does not match number of slices (%d)' % (len(start), nslices))
+
+        # Near-field geometry reusing energy, resolution and shape of the first scan's geometry
+        scan = list(self.ptycho.model.scans.values())[0]
+        geom = scan.geometries[0]
+        g = Param()
+        g.energy = geom.energy
+        g.distance = self.p.slice_thickness
+        g.psize = geom.resolution
+        g.shape = geom.shape
+        g.propagation = "nearfield"
+
+        # One forward/backward propagator per gap between neighbouring slices
+        self.fw, self.bw = [], []
+        if isinstance(self.p.slice_thickness, (list, tuple)):
+            if len(self.p.slice_thickness) != nslices - 1:
+                raise ValueError('slice_thickness must have number_of_slices-1 entries')
+            for thickness in self.p.slice_thickness:
+                g.distance = thickness
+                G = geometry.Geo(owner=None, pars=g)
+                self.fw.append(G.propagator.fw)
+                self.bw.append(G.propagator.bw)
+        else:
+            # g.distance already holds the common thickness; share one propagator
+            G = geometry.Geo(owner=None, pars=g)
+            self.fw = [G.propagator.fw] * (nslices - 1)
+            self.bw = [G.propagator.bw] * (nslices - 1)
+
+    def engine_iterate(self, num=1):
+        """
+        Compute `num` iterations; returns the view errors of the last one.
+        """
+        vieworder = sorted(self.di.views.keys())
+        rng = np.random.default_rng()
+        error_dct = {}  # returned as-is when num < 1 (was an UnboundLocalError)
+
+        for it in range(num):
+            # errors are collected afresh for every sweep over the views
+            error_dct = {}
+            rng.shuffle(vieworder)
+
+            for name in vieworder:
+                view = self.di.views[name]
+                if not view.active:
+                    continue
+
+                # Multislice update
+                error_dct[name] = self.multislice_update(view)
+
+            self.curiter += 1
+
+        return error_dct
+
+    def engine_finalize(self):
+        """Store the product of all slices in self.ob and save the slices to file."""
+        self.ob.fill(self._object[0])
+        for i in range(1, self.p.number_of_slices):
+            self.ob *= self._object[i]
+        # Save the slices
+        slices_info = Param()
+        slices_info.number_of_slices = self.p.number_of_slices
+        slices_info.slice_thickness = self.p.slice_thickness
+        slices_info.objects = {ob.ID: {ID: S._to_dict() for ID, S in ob.storages.items()}
+                               for ob in self._object}
+        slices_info.slice_start_iteration = self.p.slice_start_iteration
+        header = {'description': 'multi-slices result details.'}
+
+        h5opt = io.h5options['UNSUPPORTED']
+        io.h5options['UNSUPPORTED'] = 'ignore'
+        try:
+            logger.info(f'Saving to {self.p.fslices}')
+            io.h5write(self.p.fslices, header=header, content=slices_info)
+        finally:  # restore the global option even if writing fails
+            io.h5options['UNSUPPORTED'] = h5opt
+        return super().engine_finalize()
+
+    def multislice_update(self, view):
+        """
+        Perform one 3PIE (multislice ePIE) update for a single view and return the
+        Fourier-update error. Based on https://doi.org/10.1364/JOSAA.29.001606
+        """
+        # Forward pass through all slices except the last one
+        for i in range(self.p.number_of_slices-1):
+            for name, pod in view.pods.items():
+                # exit wave for this slice (object replaced by 1 until the slice is switched on)
+                if self.curiter >= self.p.slice_start_iteration[i]:
+                    self._exits[i][pod.pr_view] = self._probe[i][pod.pr_view] * self._object[i][pod.ob_view]
+                else:
+                    self._exits[i][pod.pr_view] = self._probe[i][pod.pr_view] * 1.
+                # forward propagation gives the incident wave for the next slice
+                self._probe[i+1][pod.pr_view] = self.fw[i](self._exits[i][pod.pr_view])
+
+        for name, pod in view.pods.items():
+            # Exit wave for last slice (same switch-on rule as above)
+            if self.curiter >= self.p.slice_start_iteration[-1]:
+                self._exits[-1][pod.pr_view] = self._probe[-1][pod.pr_view] * self._object[-1][pod.ob_view]
+            else:
+                self._exits[-1][pod.pr_view] = self._probe[-1][pod.pr_view] * 1.
+            # Save final state into pod (needed for the ptypy Fourier update)
+            pod.probe = self._probe[-1][pod.pr_view]
+            pod.object = self._object[-1][pod.ob_view]
+            pod.exit = self._exits[-1][pod.pr_view]
+
+        # Fourier update
+        error = self.fourier_update(view)
+
+        # Object/probe update for the last slice (only once it is switched on)
+        if self.curiter >= self.p.slice_start_iteration[-1]:
+            self.object_update(view, {pod.ID:self._exits[-1][pod.pr_view] for name, pod in view.pods.items()})
+            self.probe_update(view, {pod.ID:self._exits[-1][pod.pr_view] for name, pod in view.pods.items()})
+            for name, pod in view.pods.items():
+                self._object[-1][pod.ob_view] = pod.object
+                self._probe[-1][pod.pr_view] = pod.probe
+        else:
+            for name, pod in view.pods.items():
+                self._probe[-1][pod.pr_view] = pod.exit * 1.
+
+        # Object/probe update for the other slices, from downstream to upstream
+        for i in range(self.p.number_of_slices-2, -1, -1):
+            if self.curiter >= self.p.slice_start_iteration[i]:
+
+                for name, pod in view.pods.items():
+                    # Back-propagate the incident wave of slice i+1; it serves as exit wave of slice i
+                    pod.exit = self.bw[i](self._probe[i+1][pod.pr_view])
+                    # Save state of slice i into pods
+                    pod.probe = self._probe[i][pod.pr_view]
+                    pod.object = self._object[i][pod.ob_view]
+
+                # Actual object/probe update, then copy the results back into slice i
+                self.object_update(view, {pod.ID:self._exits[i][pod.pr_view] for name, pod in view.pods.items()})
+                self.probe_update(view, {pod.ID:self._exits[i][pod.pr_view] for name, pod in view.pods.items()})
+                for name, pod in view.pods.items():
+                    self._object[i][pod.ob_view] = pod.object
+                    self._probe[i][pod.pr_view] = pod.probe
+            else:
+                for name, pod in view.pods.items():
+                    self._probe[i][pod.pr_view] = self.bw[i](self._probe[i+1][pod.pr_view])
+
+        # Set the object as the product of all slices for better live plotting
+        self.ob.fill(self._object[0])
+        for i in range(1, self.p.number_of_slices):
+            self.ob *= self._object[i]
+
+        return error
\ No newline at end of file
diff --git a/templates/misc/moonflower_ThreePIE.py b/templates/misc/moonflower_ThreePIE.py
new file mode 100644
index 000000000..ce63706bf
--- /dev/null
+++ b/templates/misc/moonflower_ThreePIE.py
@@ -0,0 +1,65 @@
+"""
+This script is a test for ptychographic reconstruction with the ThreePIE
+(multislice ePIE) engine in the absence of actual data. It uses the test
+Scan class `ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+from ptypy.custom import threepie  # importing the module registers the ThreePIE engine
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+
+# saving intermediate results
+p.io.autosave = u.Param(active=False)
+
+# opens plotting GUI (if interaction is set to active)
+p.io.autoplot = u.Param(active=True)
+p.io.interaction = u.Param(active=True)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'GradFull'
+p.scans.MF.data = u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstruction engine (name must match the registered class ThreePIE)
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'ThreePIE'
+p.engines.engine00.numiter = 200
+p.engines.engine00.probe_center_tol = None
+p.engines.engine00.compute_log_likelihood = True
+p.engines.engine00.object_norm_is_global = True
+p.engines.engine00.alpha = 1
+p.engines.engine00.beta = 1
+p.engines.engine00.probe_update_start = 0
+p.engines.engine00.number_of_slices = 2
+p.engines.engine00.slice_thickness = 60e-9
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
+

From b2cf5d8b901c69af86453dbe073533d3d88aeaf3 Mon Sep 17 00:00:00 2001
From: Timothy Poon <62692924+ptim0626@users.noreply.github.com>
Date: Fri, 15 Dec 2023 16:17:39 +0000
Subject: [PATCH 23/37] Reset parallel.loadmanager after a test to ensure same
 subdividing of data afterwards (#517)

---
 test/utils.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/test/utils.py b/test/utils.py
index 7380035ef..5e184faa6 100644
--- a/test/utils.py
+++ b/test/utils.py
@@ -16,6 +16,7 @@
 import numpy as np
 from ptypy import utils as u
 from ptypy.core import Ptycho
+from ptypy.utils import parallel
 
 
 def get_test_data_path(name):
@@ -76,6 +77,11 @@ def EngineTestRunner(engine_params,propagator='farfield',output_path='./', outpu
     if init_correct_probe:
         P.probe.S['SMFG00'].data[0] = P.model.scans['MF'].ptyscan.pr
     P.run()
+
+    # important for subdividing data, ensure a fresh start if a test will be
+    # run afterwards
+    parallel.loadmanager.reset()
+
     return P
 
 
@@ -164,4 +170,9 @@ def EngineTestRunner2(engine_params,propagator='farfield',output_path='./', outp
     p.engines.engine00 = engine_params
     P = Ptycho(p, level=4)
     P.run()
+
+    # important for subdividing data, ensure a fresh start if a test will be
+    # run afterwards
+    parallel.loadmanager.reset()
+
     return P

From 82ff2b22ed8cd2796092cb822cad61db4262ccb4 Mon Sep 17 00:00:00 2001
From: Timothy Poon <62692924+ptim0626@users.noreply.github.com>
Date: Fri, 15 Dec 2023 16:22:58 +0000
Subject: [PATCH 24/37] Cast mask to float32 to avoid precision issue  (#515)

* Cast mask from bool to float32 to avoid upcasting to float64

* Apply the same cast to legacy Fourier update for completeness
---
 ptypy/engines/utils.py                 | 29 +++++++++++++-------------
 test/engine_tests/engine_utils_test.py |  1 -
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/ptypy/engines/utils.py b/ptypy/engines/utils.py
index dd580d4e8..43f67fe48 100644
--- a/ptypy/engines/utils.py
+++ b/ptypy/engines/utils.py
@@ -20,7 +20,7 @@ def dynamic_load(path, baselist, fail_silently = True):
 
     :param path: Path to Python files.
     """
-    
+
     import os
     import glob
     import re
@@ -28,40 +28,40 @@ def dynamic_load(path, baselist, fail_silently = True):
 
     # Loop through paths
     engine_path = {}
-    
+
     try:
         # Complete directory path
         directory = os.path.abspath(os.path.expanduser(path))
-    
+
         if not os.path.exists(directory):
             # Continue silently
             raise IOError('Engine path %s does not exist.'
                            % str(directory))
-    
+
         # Get list of python files
         py_files = glob.glob(directory + '/*.py')
         if not py_files:
-            raise IOError('Directory %s does not contain Python files,' 
+            raise IOError('Directory %s does not contain Python files,'
                                             % str(directory))
-    
+
         # Loop through files to find engines
         for filename in py_files:
             modname = os.path.splitext(os.path.split(filename)[-1])[0]
-    
+
             # Find classes
             res = re.findall(
                 r'^class (.*)\((.*)\)', open(filename, 'r').read(), re.M)
-    
+
             for classname, basename in res:
                 if (basename in baselist) and classname not in baselist:
                     # Match!
                     engine_path[classname] = (modname, filename)
                     u.logger.info("Found Engine '%s' in file '%s'"
                                   % (classname, filename))
-    
+
         # Load engines that have been found
         for classname, mf in engine_path.items():
-    
+
             # Import module
             modname, filename = mf
             print(modname, filename)
@@ -115,7 +115,7 @@ def projection_update_generalized(diff_view, a, b, c, pbound=None):
     .. math::
         x = 1 - a - b
 
-    and 
+    and
 
     .. math::
        y = 1 - c
@@ -158,8 +158,9 @@ def projection_update_generalized(diff_view, a, b, c, pbound=None):
     # Get measured data
     I = diff_view.data
 
-    # Get the mask
-    fmask = diff_view.pod.mask
+    # Get the mask (cast to the same type as diff, for precision when operating
+    # with other numerical arrays)
+    fmask = diff_view.pod.mask.astype(I.dtype)
 
     # Propagate the exit waves
     for name, pod in diff_view.pods.items():
@@ -340,7 +341,7 @@ def basic_fourier_update_LEGACY(diff_view, pbound=None, alpha=1., LL_error=True)
     I = diff_view.data
 
     # Get the mask
-    fmask = diff_view.pod.mask
+    fmask = diff_view.pod.mask.astype(I.dtype)
 
     # For log likelihood error
     if LL_error is True:
diff --git a/test/engine_tests/engine_utils_test.py b/test/engine_tests/engine_utils_test.py
index 52910b524..976009ddb 100644
--- a/test/engine_tests/engine_utils_test.py
+++ b/test/engine_tests/engine_utils_test.py
@@ -7,7 +7,6 @@
 """
 
 import unittest
-from test import utils as tu
 import numpy as np
 from ptypy import utils as u
 from ptypy.core import Ptycho

From 4e341116dc23bd3f6ede6280bde0919ae662de25 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Mon, 18 Dec 2023 09:50:58 +0000
Subject: [PATCH 25/37] Raise megapixel limit to 500 (#513)

* raise megapixel limit to 500

* update tests

* changed default to 100 and raise warning instead of error

* modified warning message
---
 ptypy/core/classes.py           | 6 +++---
 test/core_tests/classes_test.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ptypy/core/classes.py b/ptypy/core/classes.py
index c55ee8bbe..622077a48 100644
--- a/ptypy/core/classes.py
+++ b/ptypy/core/classes.py
@@ -81,7 +81,7 @@
 
 # Hard-coded limit in array size
 # TODO: make this dynamic from available memory.
-MEGAPIXEL_LIMIT = 50
+MEGAPIXEL_LIMIT = 100
 
 
 class Base(object):
@@ -709,8 +709,8 @@ def reformat(self, newID=None, update=True):
 
             megapixels = np.array(new_shape).astype(float).prod() / 1e6
             if megapixels > MEGAPIXEL_LIMIT:
-                raise RuntimeError('Arrays larger than %dM not supported. You '
-                                   'requested %.2fM pixels.' % (MEGAPIXEL_LIMIT, megapixels))
+                logger.warning('Arrays larger than %dM not recommended. You '
+                               'requested %.2fM pixels.' % (MEGAPIXEL_LIMIT, megapixels))
 
             # Apply Nd misfit
             if self.data is not None:
diff --git a/test/core_tests/classes_test.py b/test/core_tests/classes_test.py
index b54004fa1..eee73bc01 100644
--- a/test/core_tests/classes_test.py
+++ b/test/core_tests/classes_test.py
@@ -98,7 +98,7 @@ def test_prefixes(self):
         self.assertEqual(c.GEO_PREFIX, 'G',
                          'Default prefix changed.')
 
-        self.assertEqual(c.MEGAPIXEL_LIMIT, 50,
+        self.assertEqual(c.MEGAPIXEL_LIMIT, 100,
                          'Default MEGAPIXEL_LIMIT changed.')
 
 

From c78a84df67c813fea0275d71c0f787b85741f192 Mon Sep 17 00:00:00 2001
From: Timothy Poon <62692924+ptim0626@users.noreply.github.com>
Date: Mon, 29 Jan 2024 15:10:57 +0000
Subject: [PATCH 26/37] Retain context and device memory pool among pycuda
 engines (#520)

* Use pycuda's autoprimaryctx to set up context including clean-up

* Change get_context's call to adapt the new function

* Remove context pop/detach in engine_finalize to retain context

* Clean up context before raising exception for clean message

* Remove unnecessary new_context flag as always the primary context is returned

* Add a function to handle device memory pool creation to ensure a single instance of it

* Create device memory pool through the central function

* Modify the excepthook for pycuda with proper device memory pool clean-up
---
 ptypy/accelerate/cuda_pycuda/__init__.py      | 76 +++++++++++++------
 .../cuda_pycuda/engines/ML_pycuda.py          | 29 +++----
 .../engines/projectional_pycuda.py            |  7 +-
 .../engines/projectional_pycuda_stream.py     | 10 +--
 .../cuda_pycuda/engines/stochastic.py         |  5 +-
 5 files changed, 72 insertions(+), 55 deletions(-)

diff --git a/ptypy/accelerate/cuda_pycuda/__init__.py b/ptypy/accelerate/cuda_pycuda/__init__.py
index 3ce6a7a6e..92267c775 100644
--- a/ptypy/accelerate/cuda_pycuda/__init__.py
+++ b/ptypy/accelerate/cuda_pycuda/__init__.py
@@ -1,7 +1,13 @@
+import sys
+import os
+
+import numpy as np
 import pycuda.driver as cuda
 from pycuda.compiler import SourceModule
-import numpy as np
-import os
+from pycuda.tools import DeviceMemoryPool
+
+from ptypy.utils import parallel
+
 kernel_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'cuda_common'))
 debug_options = ['-O3', '-DNDEBUG', '-lineinfo', '-I' + kernel_dir] # release mode flags
 
@@ -11,36 +17,61 @@
 else:
     debug_options += ['-std=c++11']
 
-context = None
+# ensure pycuda's make_default_context picks up the correct GPU for that rank
+os.environ['CUDA_DEVICE'] = str(parallel.rank_local)
+
 queue = None
+dev_pool = None
+
+
+def _pycuda_excepthook(type, value, tb):
+    global dev_pool
+
+    # memory pool clean-up, avoid memory leak in the case of raising exception
+    if dev_pool is not None:
+        # only do the clean-up if it is present
+        dev_pool.stop_holding()
 
-def get_context(new_context=False, new_queue=False):
+    # raise the original exception
+    sys.__excepthook__(type, value, tb)
+sys.excepthook = _pycuda_excepthook
 
-    from ptypy.utils import parallel
 
-    global context
+def get_context(new_queue=False):
+
     global queue
 
-    if context is None or new_context:
-        cuda.init()
-        if parallel.rank_local >= cuda.Device.count():
-            raise Exception('Local rank must be smaller than total device count, \
-                rank={}, rank_local={}, device_count={}'.format(
-                parallel.rank, parallel.rank_local, cuda.Device.count()
-            ))
-        context = cuda.Device(parallel.rank_local).make_context()
-        context.push()
-        # print("made context %s on rank %s" % (str(context), str(parallel.rank)))
-        # print("The cuda device count on %s is:%s" % (str(parallel.rank),
-        #                                              str(cuda.Device.count())))
-        # print("parallel.rank:%s, parallel.rank_local:%s" % (str(parallel.rank),
-        #                                                     str(parallel.rank_local)))
+    # idempotent anyway
+    cuda.init()
+
+    if parallel.rank_local >= cuda.Device.count():
+        raise Exception('Local rank must be smaller than total device count, \
+            rank={}, rank_local={}, device_count={}'.format(
+            parallel.rank, parallel.rank_local, cuda.Device.count()
+        ))
+
+    # the existing context will always be the primary context, unless
+    # explicitly created elsewhere
+    if (context := cuda.Context.get_current()) is None:
+        from pycuda import autoprimaryctx
+        context = autoprimaryctx.context
+
     if queue is None or new_queue:
         queue = cuda.Stream()
-    
+
     return context, queue
 
 
+def get_dev_pool():
+    """Return the module-level device memory pool, creating it on first use."""
+    global dev_pool
+
+    # retain a single global instance of device memory pool
+    if dev_pool is None:
+        dev_pool = DeviceMemoryPool()
+    return dev_pool
+
+
 def load_kernel(name, subs={}, file=None):
 
     if file is None:
@@ -59,9 +90,8 @@ def load_kernel(name, subs={}, file=None):
     escaped = fn.replace("\\", "\\\\")
     kernel = '#line 1 "{}"\n'.format(escaped) + kernel
     mod = SourceModule(kernel, include_dirs=[np.get_include()], no_extern_c=True, options=debug_options)
-    
+
     if isinstance(name, str):
         return mod.get_function(name)
     else:  # tuple
         return tuple(mod.get_function(n) for n in name)
-
diff --git a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py
index 7e1320357..9799e4a5c 100644
--- a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py
+++ b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py
@@ -15,14 +15,13 @@
 from pycuda import gpuarray
 import pycuda.driver as cuda
 import pycuda.cumath
-from pycuda.tools import DeviceMemoryPool
 
 from ptypy.engines import register
 from ptypy.accelerate.base.engines.ML_serial import ML_serial, BaseModelSerial
 from ptypy import utils as u
 from ptypy.utils.verbose import logger, log
 from ptypy.utils import parallel
-from .. import get_context
+from .. import get_context, get_dev_pool
 from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel
 from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
 from ..array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel, TransposeKernel
@@ -79,13 +78,13 @@ def engine_initialize(self):
         """
         Prepare for ML reconstruction.
         """
-        self.context, self.queue = get_context(new_context=True, new_queue=True)
+        self.context, self.queue = get_context(new_queue=True)
 
         if self.p.use_cuda_device_memory_pool:
-            self._dmp = DeviceMemoryPool()
-            self.allocate = self._dmp.allocate
+            self._dev_pool = get_dev_pool()
+            self.allocate = self._dev_pool.allocate
         else:
-            self._dmp = None
+            self._dev_pool = None
             self.allocate = cuda.mem_alloc
 
         self.qu_htod = cuda.Stream()
@@ -163,8 +162,6 @@ def _setup_kernels(self):
         fit = int(mem - 200 * 1024 * 1024) // blk  # leave 200MB room for safety
         if not fit:
             log(1,"Cannot fit memory into device, if possible reduce frames per block. Exiting...")
-            self.context.pop()
-            self.context.detach()
             raise SystemExit("ptypy has been exited.")
 
         # TODO grow blocks dynamically
@@ -301,7 +298,7 @@ def engine_iterate(self, num=1):
         return err
 
     def position_update(self):
-        """ 
+        """
         Position refinement
         """
         if not self.do_position_refinement or (not self.curiter):
@@ -342,7 +339,7 @@ def position_update(self):
                 max_oby = ob.shape[-2] - aux.shape[-2] - 1
                 max_obx = ob.shape[-1] - aux.shape[-1] - 1
 
-                # We need to re-calculate the current error 
+                # We need to re-calculate the current error
                 PCK.build_aux(aux, addr, ob, pr)
                 PROP.fw(aux, aux)
                 PCK.queue.wait_for_event(ev)
@@ -351,9 +348,9 @@ def position_update(self):
                 cuda.memcpy_dtod(dest=error_state.ptr,
                                     src=err_phot.ptr,
                                     size=err_phot.nbytes)
-                
+
                 PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0])
-                                
+
                 log(4, 'Position refinement trial: iteration %s' % (self.curiter))
                 for i in range(PCK.mangler.nshifts):
                     PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx)
@@ -422,8 +419,6 @@ def engine_finalize(self):
 
 
         #self.queue.synchronize()
-        self.context.pop()
-        self.context.detach()
         super().engine_finalize()
 
 class GaussianModel(BaseModelSerial):
@@ -683,11 +678,11 @@ def __init__(self, amplitude, axes=[-2, -1], queue=None, allocator=None):
         self.DELK_f = DerivativesKernel(np.float32, queue=queue)
 
         if allocator is None:
-            self._dmp = DeviceMemoryPool()
-            self.allocator=self._dmp.allocate
+            self._dev_pool = get_dev_pool()
+            self.allocator=self._dev_pool.allocate
         else:
             self.allocator = allocator
-            self._dmp= None
+            self._dev_pool= None
 
         empty = lambda x: gpuarray.empty(x.shape, x.dtype, allocator=self.allocator)
 
diff --git a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py
index c02445265..5093d6422 100644
--- a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py
+++ b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py
@@ -69,9 +69,9 @@ def engine_initialize(self):
         Prepare for reconstruction.
         """
         # Context, Multi GPU communicator and Stream (needs to be in this order)
-        self.context, self.queue = get_context(new_context=True, new_queue=False)
+        self.context, self.queue = get_context(new_queue=False)
         self.multigpu = get_multi_gpu_communicator()
-        self.context, self.queue = get_context(new_context=False, new_queue=True)
+        self.context, self.queue = get_context(new_queue=True)
 
         # Gaussian Smoothing Kernel
         self.GSK = GaussianSmoothingKernel(queue=self.queue)
@@ -555,9 +555,6 @@ def engine_finalize(self):
         for name, s in self.pr.S.items():
             s.data = np.copy(s.data)
 
-        self.context.pop()
-        self.context.detach()
-
         # we don't need the  "benchmarking" in DM_serial
         super().engine_finalize(benchmark=False)
 
diff --git a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py
index e9204f80b..193042895 100644
--- a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py
+++ b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py
@@ -64,8 +64,6 @@ def _setup_kernels(self):
         fit = int(mem - 200 * 1024 * 1024) // blk  # leave 200MB room for safety
         if not fit:
             log(1,"Cannot fit memory into device, if possible reduce frames per block. Exiting...")
-            self.context.pop()
-            self.context.detach()
             raise SystemExit("ptypy has been exited.")
 
         # TODO grow blocks dynamically
@@ -138,7 +136,7 @@ def engine_prepare(self):
             self.ex_data.add_data_block()
             self.ma_data.add_data_block()
             self.mag_data.add_data_block()
-        
+
     def engine_iterate(self, num=1):
         """
         Compute one iteration.
@@ -148,7 +146,7 @@ def engine_iterate(self, num=1):
         atomics_probe = self.p.probe_update_cuda_atomics
         atomics_object = self.p.object_update_cuda_atomics
         use_tiles = (not atomics_object) or (not atomics_probe)
-        
+
         for it in range(num):
 
             error = {}
@@ -311,7 +309,7 @@ def engine_iterate(self, num=1):
                 # Update positions
                 if do_update_pos:
                     """
-                    Iterates through all positions and refines them by a given algorithm. 
+                    Iterates through all positions and refines them by a given algorithm.
                     """
                     log(4, "----------- START POS REF -------------")
                     for dID in self.di.S.keys():
@@ -347,7 +345,7 @@ def engine_iterate(self, num=1):
                         # wait for data to arrive
                         self.queue.wait_for_event(ev_mag)
 
-                        # We need to re-calculate the current error 
+                        # We need to re-calculate the current error
                         if self.p.position_refinement.metric == "fourier":
                             PCK.fourier_error(aux, addr, mag, ma, ma_sum)
                             PCK.error_reduce(addr, err_fourier)
diff --git a/ptypy/accelerate/cuda_pycuda/engines/stochastic.py b/ptypy/accelerate/cuda_pycuda/engines/stochastic.py
index a3363e161..881cb33a2 100644
--- a/ptypy/accelerate/cuda_pycuda/engines/stochastic.py
+++ b/ptypy/accelerate/cuda_pycuda/engines/stochastic.py
@@ -68,7 +68,7 @@ def engine_initialize(self):
         """
         Prepare for reconstruction.
         """
-        self.context, self.queue = get_context(new_context=True, new_queue=True)
+        self.context, self.queue = get_context(new_queue=True)
 
         # initialise kernels for centring probe if required
         if self.p.probe_center_tol is not None:
@@ -160,8 +160,6 @@ def _setup_kernels(self):
         fit = int(mem - 200 * 1024 * 1024) // blk  # leave 200MB room for safety
         if not fit:
             log(1,"Cannot fit memory into device, if possible reduce frames per block. Exiting...")
-            self.context.pop()
-            self.context.detach()
             raise SystemExit("ptypy has been exited.")
 
         # TODO grow blocks dynamically
@@ -484,7 +482,6 @@ def engine_finalize(self):
         for name, s in self.ob.S.items():
             s.data = np.copy(s.data)
 
-        self.context.detach()
         super().engine_finalize()
 
 

From 2b61f942cd79c37518ec4ce9bde97e4cad07c17f Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Thu, 1 Feb 2024 12:21:10 +0000
Subject: [PATCH 27/37] use latest version of checkout and setup-python (#529)

* use latest version of checkout and setup-python
* updating conda seems to break things
---
 .github/workflows/test.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 4ec05b6b6..98be46bef 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -28,16 +28,15 @@ jobs:
     name: Testing with Python ${{ matrix.python-version }} 
     steps:
     - name: Checkout
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
     - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version:  ${{ matrix.python-version }}
     - name: Add conda to system path
       run: |
         # $CONDA is an environment variable pointing to the root of the miniconda directory
         echo $CONDA/bin >> $GITHUB_PATH
-        conda update -n base conda
         conda --version
     - name: Install dependencies
       run: |

From cc929c52dbbdbed9bfcdc3ee1eeb77695af75106 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Thu, 1 Feb 2024 13:36:34 +0000
Subject: [PATCH 28/37] Modifications to the Diamond SWMRLoader (#528)

* Fixed swmr loader for arb and mapped raster scans
---
 ptypy/experiment/swmr_loader.py | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/ptypy/experiment/swmr_loader.py b/ptypy/experiment/swmr_loader.py
index e82417ecc..69300b72e 100644
--- a/ptypy/experiment/swmr_loader.py
+++ b/ptypy/experiment/swmr_loader.py
@@ -8,6 +8,7 @@
     :license: see LICENSE for details.
 """
 import h5py as h5
+import numpy as np
 
 from ptypy.experiment import register
 from ptypy.experiment.hdf5_loader import Hdf5Loader
@@ -58,6 +59,19 @@ class SwmrLoader(Hdf5Loader):
           They are zero at the scan start, but non-zero when the position
           is complete.
 
+    [positions.fast_key_with_expected_shape]
+    default = None
+    type = str
+    help = Key for fast axis inside the positions file with expected shape
+    doc = The shape of this key entry is used to estimate the expected 
+          scan trajectory mapping and the total nr. of expected frames. 
+
+    [positions.slow_key_with_expected_shape]
+    default = None
+    type = str
+    help = Key for slow axis inside the positions file with expected shape
+    doc = The shape of this key entry is used to estimate the expected 
+          scan trajectory mapping and the total nr. of expected frames. 
     """
     def __init__(self, *args, **kwargs):
         super().__init__(*args, swmr=True, **kwargs)
@@ -78,6 +92,15 @@ def _params_check(self):
         
     def _prepare_intensity_and_positions(self):
         super()._prepare_intensity_and_positions()
+        self.positions_slow_shape = self.fhandle_positions_slow[self.p.positions.slow_key_with_expected_shape].shape
+        self.positions_fast_shape = self.fhandle_positions_slow[self.p.positions.fast_key_with_expected_shape].shape
+        if len(self.data_shape[:-2]) == 2:
+            self.data_shape = self.positions_slow_shape + self.positions_fast_shape + tuple(np.array(self.data_shape)[-2:])
+        elif len(self.data_shape[:-2]) == 1:
+            self.data_shape = (self.positions_slow_shape[0],) + tuple(np.array(self.data_shape)[-2:])
+        print("self.data_shape", self.data_shape)
+        print("self.positions_slow_shape", self.positions_slow_shape)
+        print("self.positions_fast_shape", self.positions_fast_shape)
         self.kf = KeyFollower((self.fhandle_intensities[self.p.intensities.live_key],
                                self.fhandle_positions_slow[self.p.positions.live_slow_key],
                                self.fhandle_positions_fast[self.p.positions.live_fast_key]),
@@ -85,13 +108,16 @@ def _prepare_intensity_and_positions(self):
         
     def compute_scan_mapping_and_trajectory(self,*args):
         super().compute_scan_mapping_and_trajectory(*args)
-        assert isinstance(self.slow_axis, h5.Dataset), "Scantype = {:s} and mapped={:} is not compatible with the SwmrLoader".format(self._scantype, self._ismapped)
+        #assert isinstance(self.slow_axis, h5.Dataset), "Scantype = {:s} and mapped={:} is not compatible with the SwmrLoader".format(self._scantype, self._ismapped)
 
     def get_data_chunk(self, *args, **kwargs):
         self.kf.refresh()
         self.intensities.refresh()
-        self.slow_axis.refresh()
-        self.fast_axis.refresh()
+        try:
+            self.slow_axis.refresh()
+            self.fast_axis.refresh()
+        except AttributeError:
+            print("Can't refresh position keys")
         # refreshing here to update before Ptyscan.get_data_chunk calls check and load
         return super().get_data_chunk(*args, **kwargs)
 

From f8f9750125c1da0cb9241155b4c467620e5e089d Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Thu, 1 Feb 2024 13:37:27 +0000
Subject: [PATCH 29/37] streaming loader for diamond (#502)

---
 ptypy/experiment/diamond_streaming.py | 199 ++++++++++++++++++++++++++
 1 file changed, 199 insertions(+)
 create mode 100644 ptypy/experiment/diamond_streaming.py

diff --git a/ptypy/experiment/diamond_streaming.py b/ptypy/experiment/diamond_streaming.py
new file mode 100644
index 000000000..65bc5e2ff
--- /dev/null
+++ b/ptypy/experiment/diamond_streaming.py
@@ -0,0 +1,199 @@
+"""\
+Description here
+
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: see LICENSE for details.
+"""
+
+import logging
+import zmq
+import time
+import json
+import pickle
+import numpy as np
+from ptypy import utils as u
+from ptypy.core.data import PtyScan
+from ptypy.experiment import register
+from ptypy.utils.verbose import log
+
+@register()
+class DiamondZMQLoader(PtyScan):
+    """
+
+    Defaults:
+
+    [name]
+    default = 'DiamondZMQLoader'
+    type = str
+    help =
+
+    [metadata]
+    default = 'tcp://127.0.0.1:5553'
+    type = str
+    help = Address for metadata socket
+
+    [datastream]
+    default = 'tcp://127.0.0.1:5552'
+    type = str
+    help = Address for datastream socket
+
+    [chunksize]
+    default = 50
+    type = int
+    help = Nr. of frames (chunks) to be pulled at once from the socket
+
+    [logfile]
+    default = /tmp/ptypy_streaming_log.json
+    type = str
+    help = A JSON file for time logging
+    """
+
+    def __init__(self, pars=None, **kwargs):
+        self.p = self.DEFAULT.copy(99)
+        self.p.update(pars, in_place_depth=99)
+        super().__init__(self.p, **kwargs)
+
+        # ZMQ Context
+        self.context = zmq.Context()
+        
+        # Create socket to request some information and ask for metadata
+        self.metadata_socket = self.context.socket(zmq.REQ) 
+        self.metadata_socket.connect(self.p.metadata)
+        
+        # Socket to pull main data
+        self.datastream_socket = self.context.socket(zmq.PULL)
+        self.datastream_socket.connect(self.p.datastream)
+        self.connected = True
+
+        # Meta information
+        self.metadata_socket.send(b"Start")
+        log(4, 'Waiting for metadata...')
+        self.metadata = pickle.loads(self.metadata_socket.recv())
+        log(4, "Metadata recieved")
+        
+        # Setting meta/info parameters
+        self.data_dtype = self.metadata["dtype"]
+        self.data_shape = self.metadata["shape"]
+        self.frame_shape = self.data_shape[1:]
+        self.num_frames = self.data_shape[0]
+        self.p.shape = self.frame_shape
+        self.info.shape = self.p.shape
+        self.info.center = None
+        self.info.auto_center = self.p.auto_center
+        self.meta.energy  = self.p.energy
+        self.meta.distance = self.p.distance
+        self.info.psize = self.p.psize
+
+        # Create empty memory
+        self._data = np.empty(shape=self.data_shape, dtype=self.data_dtype)
+        self._pos = np.empty(shape=(self.data_shape[0], 2), dtype=float)
+        self.framecount = 0
+
+        # Logging
+        self.log = {}
+        self.log["start"] = time.time()
+        
+    def fetch(self,chunksize=1):
+        if not self.connected:
+            return
+        # Fetch data from socket
+        for i in range(chunksize):
+            databuf, posxbuf, posybuf = self.datastream_socket.recv_multipart()
+            self._data[self.framecount] = np.frombuffer(databuf,dtype=self.data_dtype).reshape(self.frame_shape)
+            self._pos[self.framecount] = np.array([float(posybuf.decode()), float(posxbuf.decode())])
+            self.framecount += 1
+
+    def check(self, frames=None, start=None):
+        """
+        Check how many frames are available.
+
+        Parameters
+        ----------
+        frames : int or None
+            Number of frames requested.
+        start : int or None
+            Scanpoint index to start checking from.
+
+        Returns
+        -------
+        frames_accessible : int
+            Number of frames readable.
+
+        end_of_scan : int or None
+            is one of the following,
+            - 0, end of the scan is not reached
+            - 1, end of scan will be reached or is
+            - None, can't say
+        """
+
+        if start is None:
+            start = self.framestart
+        
+        if frames is None:
+            frames = self.min_frames
+
+        if (self.num_frames - self.framecount) > self.p.chunksize:
+            chunksize = self.p.chunksize
+        else:
+            chunksize = 1
+        self.fetch(chunksize)
+            
+        # Check how many frames are available
+        available = self.framecount
+        new_frames = available - start        
+        # not reached expected nr. of frames
+        if new_frames <= frames:
+            # but its last chunk of scan so load it anyway
+            if available == self.num_frames:
+                frames_accessible = new_frames    
+                end_of_scan = 1
+                if self.connected:
+                    self.finish()
+                    # end all ZMQ communications
+                    self.context.destroy()
+                    self.connected = False                    
+            # otherwise, do nothing
+            else:
+                end_of_scan = 0
+                frames_accessible = 0
+        # reached expected nr. of frames
+        else:
+            end_of_scan = 0
+            frames_accessible = frames
+        #log(3, f"frames = {frames}, start = {start}, available = {available}, frames_accessible = {frames_accessible}, end_of_scan = {end_of_scan}, new_frames = {new_frames}, num_frames = {self.num_frames}")
+
+        return frames_accessible, end_of_scan
+                    
+    def load(self, indices):
+        """
+        return data
+
+        Returns
+        -------
+        raw, positions, weight : dict
+            Dictionaries whose keys are the given scan point `indices`
+            and whose values are the respective frame / position according
+            to the scan point index. `weight` and `positions` may be empty
+        """
+        intensities = {}
+        positions = {}
+        weights = {}
+        log(4, "Loading...")
+        log(4, f"indices = {indices}")
+        for ind in indices:
+            intensities[ind] = self._data[ind]
+            positions[ind] = self._pos[ind]
+            weights[ind] = np.ones(len(intensities[ind]))
+            #print(f"Loaded index {ind} with pos {positions[ind]} and data {intensities[ind].sum()}")
+                        
+        return intensities, positions, weights
+
+    def finish(self):
+        with open(self.p.logfile, "w") as f:
+            self.log["stop"] = time.time()
+            json.dump(self.log,f)
+        self.metadata_socket.send(b"Stop")
+        # while True:
+        #     reply = self.metadata_socket.recv()
+        #     print("[Recons] Recevied stop reply ", reply)
+        #     break

From e3075c105a9375ce0e41a38e01ef44a5b49ec2dc Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Thu, 1 Feb 2024 13:58:23 +0000
Subject: [PATCH 30/37] add exit buffer to copied state (#530)

---
 ptypy/core/ptycho.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/ptypy/core/ptycho.py b/ptypy/core/ptycho.py
index e9b725145..18fd22c74 100644
--- a/ptypy/core/ptycho.py
+++ b/ptypy/core/ptycho.py
@@ -1119,7 +1119,7 @@ def plot_overview(self, fignum=100):
 
     def copy_state(self, name="baseline", overwrite=False):
         """
-        Store a copy of the current state of object/probe
+        Store a copy of the current state of object/probe and exit
 
         Warning: This feature is under development and syntax might change!
         """
@@ -1133,12 +1133,14 @@ def copy_state(self, name="baseline", overwrite=False):
         self.state_dict[name] = {}
         self.state_dict[name]["ob"] = self.obj.copy()
         self.state_dict[name]["pr"] = self.probe.copy()
+        self.state_dict[name]["ex"] = self.exit.copy()
         self.state_dict[name]["runtime"] = self.runtime.copy(depth=99)
         logger.info("Saved a copy of object and probe as the {:s} state".format(name))
             
     def restore_state(self, name="baseline", reformat_exit=True):
         """
         Restore object/probe based on a previously saved copy
+        The exit buffer can be reformatted or loaded from the state
 
         Warning: This feature is under development and syntax might change!
         """
@@ -1147,6 +1149,8 @@ def restore_state(self, name="baseline", reformat_exit=True):
                 S.data[:] = self.state_dict[name]["pr"].storages[ID].data
             for ID,S in self.obj.storages.items():
                 S.data[:] = self.state_dict[name]["ob"].storages[ID].data
+            for ID,S in self.exit.storages.items():
+                S.data[:] = self.state_dict[name]["ex"].storages[ID].data   
         self.runtime = self.state_dict[name]["runtime"]
         
         # Reformat/Recalculate exit waves

From be94818585b724e91c48579d0c43c131fb604a54 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Thu, 1 Feb 2024 15:59:43 +0000
Subject: [PATCH 31/37] fixed imports in threepie moonflower example (#531)

---
 templates/misc/moonflower_ThreePIE.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/templates/misc/moonflower_ThreePIE.py b/templates/misc/moonflower_ThreePIE.py
index ce63706bf..f9f940ec5 100644
--- a/templates/misc/moonflower_ThreePIE.py
+++ b/templates/misc/moonflower_ThreePIE.py
@@ -5,7 +5,7 @@
 """
 from ptypy.core import Ptycho
 from ptypy import utils as u
-from ptypy.custom import ePIE_multislice
+from ptypy.custom import threepie
 
 import tempfile
 tmpdir = tempfile.gettempdir()
@@ -48,7 +48,7 @@
 # attach a reconstrucion engine
 p.engines = u.Param()
 p.engines.engine00 = u.Param()
-p.engines.engine00.name = 'ePIE_multislice'
+p.engines.engine00.name = 'ThreePIE'
 p.engines.engine00.numiter = 200
 p.engines.engine00.probe_center_tol = None
 p.engines.engine00.compute_log_likelihood = True

From 2fa9f5a7868c83a0de0f70e23e95b365cb375bee Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Fri, 2 Feb 2024 16:10:28 +0000
Subject: [PATCH 32/37] remove NCCL capability from pycuda, use cupy engines
 instead (#524)

---
 ptypy/accelerate/cuda_pycuda/multi_gpu.py     | 72 +------------------
 .../cuda_pycuda_tests/multi_gpu_test.py       |  4 --
 2 files changed, 3 insertions(+), 73 deletions(-)

diff --git a/ptypy/accelerate/cuda_pycuda/multi_gpu.py b/ptypy/accelerate/cuda_pycuda/multi_gpu.py
index 33113c273..73138c3ee 100644
--- a/ptypy/accelerate/cuda_pycuda/multi_gpu.py
+++ b/ptypy/accelerate/cuda_pycuda/multi_gpu.py
@@ -25,6 +25,8 @@
 4) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used. 
    It should be in DEFAULT mode.
 
+5) NCCL support has been dropped from PyCUDA module, but can be used with CuPy module instead
+
 """
 
 from pkg_resources import parse_version
@@ -35,12 +37,6 @@
 from ptypy.utils.verbose import logger, log
 import os
 
-try:
-    from cupy.cuda import nccl
-    import cupy as cp
-except ImportError:
-    nccl = None
-
 try:
     import mpi4py
 except ImportError:
@@ -48,13 +44,6 @@
 
 # properties to check which versions are available
 
-# use NCCL is it is available, and the user didn't override the
-# default selection with environment variables
-have_nccl = (nccl is not None) and \
-    (not 'PTYPY_USE_CUDAMPI' in os.environ) and \
-    (not 'PTYPY_USE_MPI' in os.environ) and \
-    ('PTYPY_USE_NCCL' in os.environ)
-
 # At the moment, we require:
 # the OpenMPI env var OMPI_MCA_opal_cuda_support to be set to true,
 # mpi4py >= 3.1.0
@@ -109,64 +98,9 @@ def allReduceSum(self, arr):
             comm = parallel.comm
             comm.Allreduce(parallel.MPI.IN_PLACE, arr)
             
-    
-class MultiGpuCommunicatorNccl(MultiGpuCommunicatorBase):
-    
-    def __init__(self):
-        super().__init__()
-
-        # Check if GPUs are in default mode        
-        if cuda.Context.get_device().get_attributes()[cuda.device_attribute.COMPUTE_MODE] != cuda.compute_mode.DEFAULT:
-            raise RuntimeError("Compute mode must be default in order to use NCCL")
-        
-        # get a unique identifier for the NCCL communicator and 
-        # broadcast it to all MPI processes (assuming one device per process)
-        if self.rank == 0:
-            self.id = nccl.get_unique_id()
-        else:
-            self.id = None
-
-        self.id = parallel.bcast(self.id)
-
-        self.com = nccl.NcclCommunicator(self.ndev, self.id, self.rank)
-
-    def allReduceSum(self, arr):
-        """Call MPI.all_reduce in-place, with array on GPU"""
-
-        buf = int(arr.gpudata)
-        count, datatype = self.__get_NCCL_count_dtype(arr)
-        
-        # no stream support here for now - it fails in NCCL when 
-        # pycuda.Stream.handle is used for some unexplained reason
-        stream = cp.cuda.Stream.null.ptr
-       
-        self.com.allReduce(buf, buf, count, datatype, nccl.NCCL_SUM, stream)
-
-    def __get_NCCL_count_dtype(self, arr):
-            if arr.dtype == np.complex64:
-                return arr.size*2, nccl.NCCL_FLOAT32
-            elif arr.dtype == np.complex128:
-                return arr.size*2, nccl.NCCL_FLOAT64
-            elif arr.dtype == np.float32:
-                return arr.size, nccl.NCCL_FLOAT32
-            elif arr.dtype == np.float64:
-                return arr.size, nccl.NCCL_FLOAT64
-            else:
-                raise ValueError("This dtype is not supported by NCCL.")
-
 
 # pick the appropriate communicator depending on installed packages
-def get_multi_gpu_communicator(use_nccl=True, use_cuda_mpi=True):
-    if have_nccl and use_nccl:
-        try:
-            comm = MultiGpuCommunicatorNccl()
-            log(4, "Using NCCL communicator")
-            return comm
-        except RuntimeError:
-            pass
-        except AttributeError:
-            # see issue #323
-            pass
+def get_multi_gpu_communicator(use_cuda_mpi=True):
     if have_cuda_mpi and use_cuda_mpi:
         try:
             comm = MultiGpuCommunicatorCudaMpi()
diff --git a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py
index fdc34a528..be96aed54 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py
@@ -85,7 +85,3 @@ def test_multigpu_mpi(self):
     @unittest.skipIf(not mgpu.have_cuda_mpi, "Cuda-aware MPI not available")
     def test_multigpu_cudampi(self):
         self.multigpu_tester(mgpu.MultiGpuCommunicatorCudaMpi())
-
-    @unittest.skipIf(not mgpu.have_nccl, "NCCL not available")
-    def test_multigpu_nccl(self):
-        self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl())

From cb888b750ddaed139366451e124ee1c05a9813ad Mon Sep 17 00:00:00 2001
From: Timothy Poon <62692924+ptim0626@users.noreply.github.com>
Date: Fri, 2 Feb 2024 16:13:53 +0000
Subject: [PATCH 33/37] Add WASP reconstruction engine (#522)

* Add sign function to return z/|z| for complex number

* Naive implementation of RASP

* Rewrite the RASP algorithm

[WIP] Fill bottom object sum with mean power

Rewrite the RASP algorithm

* Remove denominator fill-up (make recon a bit unstable)

* Support MPI in RASP

* Skip calculating probe max power (already present)

* Delete helper containers as in the parent

Remove containers

* Implement RASP serial version

[WIP] Looks like a working version

Change base RASP inheritance to PositionCorrectEngine to reduce unnecessary operations

[WIP] Fix RASP averaging (was not doing it at all)

[WIP] Final working RASP serial

* Add random seed (for shuffling diff) as a parameter in RASP

* Ensure only one RNG is generated and input vieworder is the same every time

* Ensure same view order is used between RASP_serial and RASP

* Use the same epsilon with normal implementation for float-point division

* Use customised abs2 consistently when computing af

* Avoid extra computation of af to prevent loss in precision

* Use the exact formula with normal implementation to prevent FP arithmetic issue

* Rename rasp files to WASP

* Revert "Add sign function to return z/|z| for complex number"

This reverts commit f06b5816d256969fec47af26ec096325a032c6fb.

* Revert "Avoid extra computation of af to prevent loss in precision"

This reverts commit 90457da546a025fcc67762f151ee42cb5e3d774b.

* Rename RASP to WASP

* Update WASP publication

* Add moonflower example for WASP_serial

* Remove some irrelevant parameters in WASP

* Add tests for WASP_serial

* [WIP] Draft WASP pycuda

* Divide WASP_serial ob pr update into two as in the normal version

* [WIP] WASP pycuda

* Step the ob and pr sum to the correct location for updating

* Add the WASP ob and pr averaging kernels

* Complete WASP_pycuda engine

* Match serial version of WASP to its pycuda's for averaging ob/pr

* Modify WASP examples for multiple number of probe modes

* Add WASP pycuda example

* Add WASP pycuda tests

* Remove unused ob_abs2 in WASP pr update kernel

* Add WASP cupy kernel calls

* Draft WASP cupy engine

* Handle multi-probe abs2 in the kernel and pycuda version

* Add tests for WASP pycuda kernels

* Fix WASP cupy ob_update kernel by making sure input is C-contiguous

* Add WASP cupy kernel tests

* Revert "Use the same epsilon with normal implementation for float-point division"

This reverts commit 2e91841d47ef26d60d35ea9f06b089923cbe1e3a.

* Revert "Use customised abs2 consistently when computing af"

This reverts commit b9c158d00e49e49b4f336717c4a13074fb9d97e5.

* Revert "Use the exact formula with normal implementation to prevent FP arithmetic issue"

This reverts commit 753f154f17b2f16e154104ab09910c5823ffd324.

* Add probe centering to WASP pycuda and cupy

* Remove custom probe power correction

* Remove tests related to custom probe power correction

* Add tests for WASP cupy

* Add WASP cupy moonflower example

* Remove unnecessary clean-up in WASP pycuda because of #520

* Combine ob/pr avg wasp kernel in serial version

* Remove ob/pr avg wasp kernels

* Combine ob/pr avg wasp kernel in pycuda version

* Combine ob/pr avg wasp kernels in cupy version

* Remove redundant ob/pr avg wasp kernel tests
---
 ptypy/accelerate/base/kernels.py              |  50 +-
 ptypy/accelerate/cuda_common/avg_wasp.cu      |  47 ++
 .../accelerate/cuda_common/ob_update_wasp.cu  |  89 +++
 .../accelerate/cuda_common/pr_update_wasp.cu  |  83 +++
 ptypy/accelerate/cuda_cupy/kernels.py         |  95 +++
 ptypy/accelerate/cuda_pycuda/kernels.py       | 139 ++++-
 ptypy/custom/WASP.py                          | 377 ++++++++++++
 ptypy/custom/WASP_cupy.py                     | 542 ++++++++++++++++++
 ptypy/custom/WASP_pycuda.py                   | 538 +++++++++++++++++
 ptypy/custom/WASP_serial.py                   | 450 +++++++++++++++
 templates/misc/moonflower_WASP.py             |  58 ++
 templates/misc/moonflower_WASP_cupy.py        |  59 ++
 templates/misc/moonflower_WASP_pycuda.py      |  59 ++
 templates/misc/moonflower_WASP_serial.py      |  58 ++
 .../accelerate_tests/base_tests/WASP_tests.py | 155 +++++
 .../cuda_cupy_tests/WASP_tests.py             | 154 +++++
 .../cuda_cupy_tests/po_update_kernel_test.py  | 244 +++++++-
 .../cuda_pycuda_tests/WASP_tests.py           | 154 +++++
 .../po_update_kernel_test.py                  | 244 +++++++-
 19 files changed, 3539 insertions(+), 56 deletions(-)
 create mode 100644 ptypy/accelerate/cuda_common/avg_wasp.cu
 create mode 100644 ptypy/accelerate/cuda_common/ob_update_wasp.cu
 create mode 100644 ptypy/accelerate/cuda_common/pr_update_wasp.cu
 create mode 100644 ptypy/custom/WASP.py
 create mode 100644 ptypy/custom/WASP_cupy.py
 create mode 100644 ptypy/custom/WASP_pycuda.py
 create mode 100644 ptypy/custom/WASP_serial.py
 create mode 100644 templates/misc/moonflower_WASP.py
 create mode 100644 templates/misc/moonflower_WASP_cupy.py
 create mode 100644 templates/misc/moonflower_WASP_pycuda.py
 create mode 100644 templates/misc/moonflower_WASP_serial.py
 create mode 100644 test/accelerate_tests/base_tests/WASP_tests.py
 create mode 100644 test/accelerate_tests/cuda_cupy_tests/WASP_tests.py
 create mode 100644 test/accelerate_tests/cuda_pycuda_tests/WASP_tests.py

diff --git a/ptypy/accelerate/base/kernels.py b/ptypy/accelerate/base/kernels.py
index af1b65b11..c5343f940 100644
--- a/ptypy/accelerate/base/kernels.py
+++ b/ptypy/accelerate/base/kernels.py
@@ -62,7 +62,7 @@ def fourier_error(self, b_aux, addr, mag, mask, mask_sum):
 
         ## Actual math ##
 
-        # build model from complex fourier magnitudes, summing up 
+        # build model from complex fourier magnitudes, summing up
         # all modes incoherently
         tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2])
         af = np.sqrt((np.abs(tf) ** 2).sum(1))
@@ -86,7 +86,7 @@ def fourier_deviation(self, b_aux, addr, mag):
 
         ## Actual math ##
 
-        # build model from complex fourier magnitudes, summing up 
+        # build model from complex fourier magnitudes, summing up
         # all modes incoherently
         tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2])
         af = np.sqrt((np.abs(tf) ** 2).sum(1))
@@ -136,12 +136,12 @@ def fmag_all_update(self, b_aux, addr, mag, mask, err_sum, pbound=0.0):
 
         ## As opposed to DM we use renorm to differentiate the cases.
 
-        # pbound >= g_err_sum  
+        # pbound >= g_err_sum
         # fm = 1.0 (as renorm = 1, i.e. renorm[~ind])
         # pbound < g_err_sum :
-        # fm = (1 - g_mask) + g_mask * (g_mag + fdev * renorm) / (af + 1e-10) 
+        # fm = (1 - g_mask) + g_mask * (g_mag + fdev * renorm) / (af + 1e-10)
         # (as renorm in [0,1])
-        # pbound == 0.0 
+        # pbound == 0.0
         # fm = (1 - g_mask) + g_mask * g_mag / (af + 1e-10) (as renorm=0)
 
         ind = err_sum > pbound
@@ -192,7 +192,7 @@ def log_likelihood(self, b_aux, addr, mag, mask, err_phot):
         # batch buffers
         aux = b_aux[:maxz * self.nmodes]
 
-        # build model from complex fourier magnitudes, summing up 
+        # build model from complex fourier magnitudes, summing up
         # all modes incoherently
         tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2])
         LL = (np.abs(tf) ** 2).sum(1)
@@ -516,7 +516,7 @@ def _build_exit_alpha_tau(self, b_aux, addr, ob, pr, ex, alpha=1, tau=1):
                   ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] + \
                   (1 - tau * (1 + alpha)) * \
                   ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] * \
-                  pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] 
+                  pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols]
 
             ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] += dex
             aux[ind, :, :] = dex
@@ -660,6 +660,40 @@ def pr_norm_local(self, addr, pr, prn):
             pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols]).real
         return
 
+    def ob_update_wasp(self, addr, ob, pr, ex, aux, ob_sum_nmr, ob_sum_dnm, alpha=1):
+        """Update ob in place for every address entry and add to ob_sum_nmr / ob_sum_dnm."""
+        sh = addr.shape
+        flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3])
+        rows, cols = ex.shape[-2:]
+        for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr):
+            pr_conj = pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols].conj()
+            pr_abs2 = abs2(pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols])
+            deltaEW = ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] - aux[ind, :, :]
+            # object step: half of pr_conj * deltaEW over (alpha * mean(pr_abs2) + pr_abs2)
+            ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] += 0.5 * pr_conj * deltaEW / (pr_abs2.mean() * alpha + pr_abs2)
+            # running sums of pr_conj * ex and pr_abs2 (presumably averaged later, cf. avg_wasp)
+            ob_sum_nmr[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] += pr_conj * ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols]
+            ob_sum_dnm[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols] += pr_abs2
+
+    def pr_update_wasp(self, addr, pr, ob, ex, aux, pr_sum_nmr, pr_sum_dnm, beta=1):
+        """Update pr in place for every address entry and add to pr_sum_nmr / pr_sum_dnm."""
+        sh = addr.shape
+        flat_addr = addr.reshape(sh[0] * sh[1], sh[2], sh[3])
+        rows, cols = ex.shape[-2:]
+        for ind, (prc, obc, exc, mac, dic) in enumerate(flat_addr):
+            ob_conj = ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols].conj()
+            ob_abs2 = abs2(ob[obc[0], obc[1]:obc[1] + rows, obc[2]:obc[2] + cols])
+            deltaEW = ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols] - aux[ind, :, :]
+            # probe step: ob_conj * deltaEW over (beta + ob_abs2)
+            pr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] += ob_conj * deltaEW / (beta + ob_abs2)
+            # running sums of ob_conj * ex and ob_abs2 (presumably averaged later, cf. avg_wasp)
+            pr_sum_nmr[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] += ob_conj * ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols]
+            pr_sum_dnm[prc[0], prc[1]:prc[1] + rows, prc[2]:prc[2] + cols] += ob_abs2
+
+    def avg_wasp(self, arr, nmr, dnm):
+        """Write nmr / dnm into arr; where dnm is close to zero, write nmr unchanged (no 0-division)."""
+        arr[:] = np.divide(nmr, dnm, out=np.array(nmr), where=~np.isclose(dnm, 0))
+
 
 class PositionCorrectionKernel(BaseKernel):
     from ptypy.accelerate.base import address_manglers
@@ -775,7 +809,7 @@ def log_likelihood(self, b_aux, addr, mag, mask, err_sum):
         # batch buffers
         aux = b_aux[:maxz * self.nmodes]
 
-        # build model from complex fourier magnitudes, summing up 
+        # build model from complex fourier magnitudes, summing up
         # all modes incoherently
         tf = aux.reshape(maxz, self.nmodes, sh[1], sh[2])
         LL = (np.abs(tf) ** 2).sum(1)
diff --git a/ptypy/accelerate/cuda_common/avg_wasp.cu b/ptypy/accelerate/cuda_common/avg_wasp.cu
new file mode 100644
index 000000000..0c2fce77a
--- /dev/null
+++ b/ptypy/accelerate/cuda_common/avg_wasp.cu
@@ -0,0 +1,47 @@
+/** avg_wasp
+*
+* Data types:
+* - IN_TYPE: the data type for the inputs (float or double)
+* - OUT_TYPE: the data type for the outputs (float or double)
+* - MATH_TYPE: the data type used for computation
+*/
+
+#include "common.cuh"
+
+// specify max number of threads/block and min number of blocks per SM,
+// to assist the compiler in register optimisations.
+// We achieve a higher occupancy in this case, as less registers are used
+// (guided by profiler)
+extern "C" __global__ void __launch_bounds__(1024, 2)
+    avg_wasp(complex<OUT_TYPE> *arr,  // output: nmr / dnm, or nmr where dnm == 0
+             const complex<IN_TYPE>* __restrict__ nmr,
+             const IN_TYPE* __restrict__ dnm,
+             int A,  // not read inside the kernel body
+             int B,  // rows per B x C slice
+             int C   // columns per B x C slice (row stride)
+             )
+{
+    const int bid = blockIdx.z;  // index of the B x C slice handled by this block
+    const int tx = threadIdx.x;
+    const int b = threadIdx.y + blockIdx.y * blockDim.y;  // row handled by this thread
+
+    /* offset all three arrays to the B x C slice selected by blockIdx.z */
+    arr += bid * B * C;
+    nmr += bid * B * C;
+    dnm += bid * B * C;
+
+    if (b >= B)
+        return;
+
+    for (int c = tx; c < C; c += blockDim.x) {  // threads step along the row by blockDim.x
+      if (dnm[b * C + c] != 0) {
+        auto avg_val_tmp = nmr[b * C + c] / dnm[b * C + c];
+        complex<OUT_TYPE> avg_val = avg_val_tmp;
+        arr[b * C + c] = avg_val;
+      }
+      else {  // denominator is exactly zero: pass the numerator through
+        complex<OUT_TYPE> avg_val = nmr[b * C + c];
+        arr[b * C + c] = avg_val;
+      }
+    }
+}
diff --git a/ptypy/accelerate/cuda_common/ob_update_wasp.cu b/ptypy/accelerate/cuda_common/ob_update_wasp.cu
new file mode 100644
index 000000000..7127cc287
--- /dev/null
+++ b/ptypy/accelerate/cuda_common/ob_update_wasp.cu
@@ -0,0 +1,89 @@
+/** ob_update_wasp - in WASP algorithm.
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ */
+
+#include "common.cuh"
+
+template <class T>
+__device__ inline void atomicAdd(complex<T>* x, const complex<T>& y)
+{
+  auto xf = reinterpret_cast<T*>(x);  // address x as two consecutive T values
+  atomicAdd(xf, y.real());      // first T: real part
+  atomicAdd(xf + 1, y.imag());  // second T: imaginary part, added by a separate atomic
+}
+
+extern "C" __global__ void ob_update_wasp(
+    const complex<IN_TYPE>* __restrict__ exit_wave,
+    const complex<IN_TYPE>* __restrict__ aux,
+    int A,  // A, B, C: exit wave dimensions (the Python wrappers pass ex.shape)
+    int B,
+    int C,
+    const complex<IN_TYPE>* __restrict__ probe,
+    const IN_TYPE* __restrict__ probe_abs2,
+    int D,  // D, E, F: probe dimensions (the Python wrappers pass pr.shape)
+    int E,
+    int F,
+    complex<OUT_TYPE>* obj,
+    complex<OUT_TYPE>* obj_sum_nmr,
+    OUT_TYPE* obj_sum_dnm,
+    int G,  // G, H, I: object dimensions (the Python wrappers pass ob.shape)
+    int H,
+    int I,
+    const int* __restrict__ addr,
+    const IN_TYPE* __restrict__ probe_abs2_mean,  // indexed by pa[0] below
+    const IN_TYPE alpha_)
+{
+  const int bid = blockIdx.z;  // selects the address entry and aux slice of this block
+  const int tx = threadIdx.x;
+  const int b = threadIdx.y + blockIdx.y * blockDim.y;  // row handled by this thread
+  if (b >= B)
+    return;
+  const int addr_stride = 15;  // ints per address entry; the wrappers require a 5 x 3 layout
+
+  const int* oa = addr + 3 + bid * addr_stride;  // object address: second triple
+  const int* pa = addr + bid * addr_stride;      // probe address: first triple
+  const int* ea = addr + 6 + bid * addr_stride;  // exit wave address: third triple
+
+  probe += pa[0] * E * F + pa[1] * F + pa[2];
+  probe_abs2 += pa[0] * E * F + pa[1] * F + pa[2];
+  obj += oa[0] * H * I + oa[1] * I + oa[2];
+  obj_sum_nmr += oa[0] * H * I + oa[1] * I + oa[2];
+  obj_sum_dnm += oa[0] * H * I + oa[1] * I + oa[2];
+  aux += bid * B * C;
+  /*the abs2 mean of this probe mode*/
+  const MATH_TYPE probe_abs2_mean_val = probe_abs2_mean[pa[0]];
+  const MATH_TYPE alpha = alpha_;
+
+  assert(oa[0] * H * I + oa[1] * I + oa[2] + (B - 1) * I + C - 1 < G * H * I);
+
+  exit_wave += ea[0] * B * C;  // only ea[0] is applied; ea[1] and ea[2] are not read
+
+  for (int c = tx; c < C; c += blockDim.x)  // threads step along the row by blockDim.x
+  {
+      complex<MATH_TYPE> probe_val = probe[b * F + c];
+      MATH_TYPE probe_abs2_val = probe_abs2[b * F + c];
+      complex<MATH_TYPE> exit_val = exit_wave[b * C + c];
+      complex<MATH_TYPE> aux_val = aux[b * C + c];
+
+      /*(pr_abs2.mean() * alpha + pr_abs2)*/
+      MATH_TYPE norm_val = probe_abs2_mean_val * alpha + probe_abs2_val;
+
+      /*0.5 * pr_conj * deltaEW / (pr_abs2.mean() * alpha + pr_abs2)*/
+      auto add_val_0 = MATH_TYPE(0.5) * conj(probe_val) * (exit_val - aux_val) / norm_val;
+      complex<OUT_TYPE> add_val = add_val_0;
+      atomicAdd(&obj[b * I + c], add_val);
+
+      /*pr_conj * ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols]*/
+      auto add_val_1 = conj(probe_val) * exit_val;
+      complex<OUT_TYPE> add_val_nmr = add_val_1;
+      atomicAdd(&obj_sum_nmr[b * I + c], add_val_nmr);
+
+      /*pr_abs2*/
+      OUT_TYPE add_val_dnm = probe_abs2_val;
+      atomicAdd(&obj_sum_dnm[b * I + c], add_val_dnm);
+  }
+}
diff --git a/ptypy/accelerate/cuda_common/pr_update_wasp.cu b/ptypy/accelerate/cuda_common/pr_update_wasp.cu
new file mode 100644
index 000000000..04c2a08e5
--- /dev/null
+++ b/ptypy/accelerate/cuda_common/pr_update_wasp.cu
@@ -0,0 +1,83 @@
+/** pr_update_wasp - in WASP algorithm.
+ *
+ * Data types:
+ * - IN_TYPE: the data type for the inputs (float or double)
+ * - OUT_TYPE: the data type for the outputs (float or double)
+ * - MATH_TYPE: the data type used for computation
+ */
+
+#include "common.cuh"
+
+template <class T>
+__device__ inline void atomicAdd(complex<T>* x, const complex<T>& y)
+{
+  auto xf = reinterpret_cast<T*>(x);  // address x as two consecutive T values
+  atomicAdd(xf, y.real());      // first T: real part
+  atomicAdd(xf + 1, y.imag());  // second T: imaginary part, added by a separate atomic
+}
+
+extern "C" __global__ void pr_update_wasp(
+    const complex<IN_TYPE>* __restrict__ exit_wave,
+    const complex<IN_TYPE>* __restrict__ aux,
+    int A,  // A, B, C: exit wave dimensions (the Python wrappers pass ex.shape)
+    int B,
+    int C,
+    complex<OUT_TYPE>* probe,
+    complex<OUT_TYPE>* probe_sum_nmr,
+    OUT_TYPE* probe_sum_dnm,
+    int D,  // D, E, F: probe dimensions (the Python wrappers pass pr.shape)
+    int E,
+    int F,
+    const complex<IN_TYPE>* __restrict__ obj,
+    int G,  // G, H, I: object dimensions (the Python wrappers pass ob.shape)
+    int H,
+    int I,
+    const int* __restrict__ addr,
+    const IN_TYPE beta_)
+{
+  assert(B == E);  // prsh[1]
+  assert(C == F);  // prsh[2]
+  const int bid = blockIdx.z;  // selects the address entry and aux slice of this block
+  const int tx = threadIdx.x;
+  const int b = threadIdx.y + blockIdx.y * blockDim.y;  // row handled by this thread
+  if (b >= B)
+    return;
+  const int addr_stride = 15;  // ints per address entry; the wrappers require a 5 x 3 layout
+
+  const int* oa = addr + 3 + bid * addr_stride;  // object address: second triple
+  const int* pa = addr + bid * addr_stride;      // probe address: first triple
+  const int* ea = addr + 6 + bid * addr_stride;  // exit wave address: third triple
+
+  probe += pa[0] * E * F + pa[1] * F + pa[2];
+  probe_sum_nmr += pa[0] * E * F + pa[1] * F + pa[2];
+  probe_sum_dnm += pa[0] * E * F + pa[1] * F + pa[2];
+  obj += oa[0] * H * I + oa[1] * I + oa[2];
+  aux += bid * B * C;
+  const MATH_TYPE beta = beta_;
+
+  assert(oa[0] * H * I + oa[1] * I + oa[2] + (B - 1) * I + C - 1 < G * H * I);
+
+  exit_wave += ea[0] * B * C;  // only ea[0] is applied; ea[1] and ea[2] are not read
+
+  for (int c = tx; c < C; c += blockDim.x)  // threads step along the row by blockDim.x
+  {
+      complex<MATH_TYPE> obj_val = obj[b * I + c];
+      MATH_TYPE obj_abs2_val = obj_val.real() * obj_val.real() + obj_val.imag() * obj_val.imag();
+      complex<MATH_TYPE> exit_val = exit_wave[b * C + c];
+      complex<MATH_TYPE> aux_val = aux[b * C + c];
+
+      /*ob_conj * deltaEW / (beta + ob_abs2)*/
+      auto add_val_0 = conj(obj_val) * (exit_val - aux_val) / (beta + obj_abs2_val);
+      complex<OUT_TYPE> add_val = add_val_0;
+      atomicAdd(&probe[b * F + c], add_val);
+
+      /*ob_conj * ex[exc[0], exc[1]:exc[1] + rows, exc[2]:exc[2] + cols]*/
+      auto add_val_1 = conj(obj_val) * exit_val;
+      complex<OUT_TYPE> add_val_nmr = add_val_1;
+      atomicAdd(&probe_sum_nmr[b * F + c], add_val_nmr);
+
+      /*ob_abs2*/
+      OUT_TYPE add_val_dnm = obj_abs2_val;
+      atomicAdd(&probe_sum_dnm[b * F + c], add_val_dnm);
+  }
+}
diff --git a/ptypy/accelerate/cuda_cupy/kernels.py b/ptypy/accelerate/cuda_cupy/kernels.py
index 53c012076..6d4de55dd 100644
--- a/ptypy/accelerate/cuda_cupy/kernels.py
+++ b/ptypy/accelerate/cuda_cupy/kernels.py
@@ -903,6 +903,24 @@ def __init__(self, queue_thread=None,
             'MATH_TYPE': self.math_type,
             'ACC_TYPE': self.accumulator_type
         })
+        self.ob_update_wasp_cuda = load_kernel("ob_update_wasp", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type,
+            'ACC_TYPE': self.accumulator_type
+        })
+        self.pr_update_wasp_cuda = load_kernel("pr_update_wasp", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type,
+            'ACC_TYPE': self.accumulator_type
+        })
+        self.avg_wasp_cuda = load_kernel("avg_wasp", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type,
+            'ACC_TYPE': self.accumulator_type
+        })
 
     def ob_update(self, addr, ob, obn, pr, ex, atomics=True):
         obsh = [np.int32(ax) for ax in ob.shape]
@@ -1181,6 +1199,83 @@ def pr_norm_local(self, addr, pr, prn):
                   prsh[0], prsh[1], prsh[2],
                   addr))
 
+    def ob_update_wasp(self, addr, ob, pr, ex, aux, ob_sum_nmr, ob_sum_dnm,
+                       alpha=1):
+        """Launch the WASP object-update kernel, which adds atomically into ob and both sums."""
+        # atomics version only
+        if addr.shape[3] != 3 or addr.shape[2] != 5:
+            raise ValueError('Address not in required shape for tiled ob_update')
+        # select the stream first so the temporaries below run on the kernel's stream
+        if self.queue is not None:
+            self.queue.use()
+        # ensure it is C-contiguous!
+        pr_abs2 = cp.ascontiguousarray((pr * pr.conj()).real)
+        pr_abs2_mean = cp.mean(pr_abs2, axis=(1,2))
+
+        obsh = [np.int32(ax) for ax in ob.shape]
+        prsh = [np.int32(ax) for ax in pr.shape]
+        exsh = [np.int32(ax) for ax in ex.shape]
+        num_pods = np.int32(addr.shape[0] * addr.shape[1])
+        bx = 64
+        by = 1
+        self.ob_update_wasp_cuda(
+                grid=(1, int((exsh[1] + by - 1)//by), int(num_pods)),
+                block=(bx, by, 1),
+                args=(ex, aux,
+                      exsh[0], exsh[1], exsh[2],
+                      pr,
+                      pr_abs2,
+                      prsh[0], prsh[1], prsh[2],
+                      ob,
+                      ob_sum_nmr,
+                      ob_sum_dnm,
+                      obsh[0], obsh[1], obsh[2],
+                      addr,
+                      pr_abs2_mean,
+                      np.float32(alpha)))
+
+    def pr_update_wasp(self, addr, pr, ob, ex, aux, pr_sum_nmr, pr_sum_dnm,
+                       beta=1):
+        """Launch the WASP probe-update kernel, which adds atomically into pr and both sums."""
+        if self.queue is not None:
+            self.queue.use()
+        obsh = [np.int32(ax) for ax in ob.shape]
+        prsh = [np.int32(ax) for ax in pr.shape]
+        exsh = [np.int32(ax) for ax in ex.shape]
+
+        # atomics version only; the message names the probe update, not ob_update
+        if addr.shape[3] != 3 or addr.shape[2] != 5:
+            raise ValueError('Address not in required shape for tiled pr_update')
+
+        num_pods = np.int32(addr.shape[0] * addr.shape[1])
+        bx = 64
+        by = 1
+        self.pr_update_wasp_cuda(
+                grid=(1, int((exsh[1] + by - 1)//by), int(num_pods)),
+                block=(bx, by, 1),
+                args=(ex, aux,
+                      exsh[0], exsh[1], exsh[2],
+                      pr,
+                      pr_sum_nmr,
+                      pr_sum_dnm,
+                      prsh[0], prsh[1], prsh[2],
+                      ob,
+                      obsh[0], obsh[1], obsh[2],
+                      addr,
+                      np.float32(beta)))
+
+    def avg_wasp(self, arr, nmr, dnm):
+        """Launch the avg_wasp kernel on arr, nmr and dnm, passing arr's three dimensions."""
+        arrsh = [np.int32(ax) for ax in arr.shape]
+        bx = 64
+        by = 1
+        if self.queue is not None:
+            self.queue.use()
+        self.avg_wasp_cuda(
+                grid=(1, int((arrsh[1] + by - 1)//by), int(arrsh[0])),  # y: rows, z: leading index
+                block=(bx, by, 1),
+                args=(arr, nmr, dnm, arrsh[0], arrsh[1], arrsh[2]))
+
 
 class PositionCorrectionKernel(ab.PositionCorrectionKernel):
     from ptypy.accelerate.cuda_cupy import address_manglers
diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py
index fcb1448bb..9767ff370 100644
--- a/ptypy/accelerate/cuda_pycuda/kernels.py
+++ b/ptypy/accelerate/cuda_pycuda/kernels.py
@@ -89,7 +89,7 @@ def _bw(x,y):
                     self._CPK.crop_pad_2d_simple(y, self._tmp)
                 else:
                     self._fft2.ift(x,y)
-            
+
             self.fw = _fw
             self.bw = _bw
 
@@ -110,11 +110,11 @@ def _bw(x,y):
             def _fw(x,y):
                 self._fft1.ft(x,y)
                 self._fft3.ift(y,y)
-            
+
             def _bw(x,y):
                 self._fft2.ft(x,y)
                 self._fft3.ift(y,y)
-                
+
             self.fw = _fw
             self.bw = _bw
         else:
@@ -303,7 +303,7 @@ def fmag_all_update(self, f, addr, fmag, fmask, err_fmag, pbound=0.0):
                                   block=(32, 32, 1),
                                   grid=(int(fmag.shape[0]*self.nmodes), 1, 1),
                                   stream=self.queue)
-    
+
     def fmag_update_nopbound(self, f, addr, fmag, fmask):
         fdev = self.gpu.fdev
         bx = 64
@@ -322,8 +322,8 @@ def fmag_update_nopbound(self, f, addr, fmag, fmask):
                                   np.int32(self.fshape[1]),
                                   np.int32(self.fshape[2]),
                                   block=(bx, by, 1),
-                                  grid=(1, 
-                                    int((self.fshape[2] + by - 1) // by), 
+                                  grid=(1,
+                                    int((self.fshape[2] + by - 1) // by),
                                     int(fmag.shape[0]*self.nmodes)),
                                   stream=self.queue)
 
@@ -499,9 +499,9 @@ def make_aux2(self, b_aux, addr, ob, pr, ex, c_po=1.0, c_e=0.0):
                             np.float32(c_e) if ex.dtype == np.complex64 else np.float64(c_e),
                             block=(bx, by, 1),
                             grid=(
-                                1, 
-                                int((ex.shape[1] + by - 1)//by), 
-                                int(maxz * nmodes)), 
+                                1,
+                                int((ex.shape[1] + by - 1)//by),
+                                int(maxz * nmodes)),
                             stream=self.queue)
 
 
@@ -544,8 +544,8 @@ def build_exit_alpha_tau(self, b_aux, addr, ob, pr, ex, alpha=1, tau=1):
                                        obr, obc,
                                        addr,
                                        np.float32(alpha), np.float32(tau),
-                                       block=(bx, by, 1), 
-                                       grid=(1, int((ex.shape[1] + by - 1) // by), int(maxz * nmodes)), 
+                                       block=(bx, by, 1),
+                                       grid=(1, int((ex.shape[1] + by - 1) // by), int(maxz * nmodes)),
                                        stream=self.queue)
     """
     def build_aux_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False):
@@ -590,8 +590,8 @@ def build_aux2_no_ex(self, b_aux, addr, ob, pr, fac=1.0, add=False):
                                   block=(bx, by, 1),
                                   grid=(1, int((b_aux.shape[-2] + by - 1)//by), int(maxz * nmodes)),
                                   stream=self.queue)
-    
-    
+
+
     def _cache_object_shape(self, ob):
         oid = id(ob)
 
@@ -611,7 +611,7 @@ def __init__(self, aux, nmodes=1, queue=None, accumulate_type = 'double', math_t
         self.math_type = math_type
         if (accumulate_type not in ['double', 'float']) or (math_type not in ['double', 'float']):
             raise ValueError("accumulate and math types must be double for float")
- 
+
         self.gpu = Adict()
         self.gpu.LLden = None
         self.gpu.LLerr = None
@@ -632,9 +632,9 @@ def __init__(self, aux, nmodes=1, queue=None, accumulate_type = 'double', math_t
             'BDIM_Y': 32
         })
         self.fill_b_cuda, self.fill_b_reduce_cuda = load_kernel(
-            ('fill_b', 'fill_b_reduce'), 
+            ('fill_b', 'fill_b_reduce'),
             {
-                **subs, 
+                **subs,
                 'BDIM_X': 1024,
                 'OUT_TYPE': 'float' if self.ftype == np.float32 else 'double'
             },
@@ -814,7 +814,7 @@ def main(self, b_aux, addr, w, I):
 
 class PoUpdateKernel(ab.PoUpdateKernel):
 
-    def __init__(self, queue_thread=None, 
+    def __init__(self, queue_thread=None,
         math_type='float', accumulator_type='float'):
         super(PoUpdateKernel, self).__init__()
         # and now initialise the cuda
@@ -875,7 +875,24 @@ def __init__(self, queue_thread=None,
             'MATH_TYPE': self.math_type,
             'ACC_TYPE': self.accumulator_type
         })
-
+        self.ob_update_wasp_cuda = load_kernel("ob_update_wasp", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type,
+            'ACC_TYPE': self.accumulator_type
+        })
+        self.pr_update_wasp_cuda = load_kernel("pr_update_wasp", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type,
+            'ACC_TYPE': self.accumulator_type
+        })
+        self.avg_wasp_cuda = load_kernel("avg_wasp", {
+            'IN_TYPE': 'float',
+            'OUT_TYPE': 'float',
+            'MATH_TYPE': self.math_type,
+            'ACC_TYPE': self.accumulator_type
+        })
 
     def ob_update(self, addr, ob, obn, pr, ex, atomics=True):
         obsh = [np.int32(ax) for ax in ob.shape]
@@ -995,7 +1012,7 @@ def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True):
                                     np.int32(ex.shape[0]),
                                     np.int32(ex.shape[1]),
                                     np.int32(ex.shape[2]),
-                                    ob, pr, ex, addr, 
+                                    ob, pr, ex, addr,
                                     np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
                                     block=(16, 16, 1), grid=grid, stream=self.queue)
 
@@ -1031,7 +1048,7 @@ def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False):
             grid = (grid[0], grid[1], int(1))
             self.pr_update2_ML_cuda(prsh[-1], obsh[-2], obsh[-1],
                                  prsh[0], obsh[0], num_pods,
-                                 pr, ob, ex, addr, 
+                                 pr, ob, ex, addr,
                                  np.float32(fac) if ex.dtype == np.complex64 else np.float64(fac),
                                  block=(16, 16, 1), grid=grid, stream=self.queue)
 
@@ -1092,7 +1109,7 @@ def ob_norm_local(self, addr, ob, obn):
         obnsh = [np.int32(ax) for ax in obn.shape]
         bx = 64
         by = 1
-        self.ob_norm_local_cuda(obn, 
+        self.ob_norm_local_cuda(obn,
             obnsh[0], obnsh[1], obnsh[2],
             ob,
             obsh[0], obsh[1], obsh[2],
@@ -1101,12 +1118,12 @@ def ob_norm_local(self, addr, ob, obn):
             grid=(1, int((obnsh[1] + by - 1)//by), int(obnsh[0])),
             stream=self.queue)
 
-    def pr_norm_local(self, addr, pr, prn):        
+    def pr_norm_local(self, addr, pr, prn):
         prsh  = [np.int32(ax) for ax in pr.shape]
         prnsh = [np.int32(ax) for ax in prn.shape]
         bx = 64
         by = 1
-        self.pr_norm_local_cuda(prn, 
+        self.pr_norm_local_cuda(prn,
             prnsh[0], prnsh[1], prnsh[2],
             pr,
             prsh[0], prsh[1], prsh[2],
@@ -1115,6 +1132,80 @@ def pr_norm_local(self, addr, pr, prn):
             grid=(1, int((prnsh[1] + by - 1)//by), int(prnsh[0])),
             stream=self.queue)
 
+    def ob_update_wasp(self, addr, ob, pr, ex, aux, ob_sum_nmr, ob_sum_dnm,
+                       alpha=1):
+        """Launch the WASP object-update kernel with pr_abs2 and its mean per leading index."""
+        pr_abs2 = (pr * pr.conj()).real
+        # mean of pr_abs2 for each leading index, computed in a Python loop because
+        # PyCUDA has no axis-wise sum or mean/avg method; may need a separate kernel
+        pr_sz = pr_abs2.shape[1] * pr_abs2.shape[2]
+        pr_abs2_mean = gpuarray.empty(pr_abs2.shape[0], dtype=pr_abs2.dtype)
+        for k, pr_abs2_i in enumerate(pr_abs2):
+            pr_abs2_mean[k] = gpuarray.sum(pr_abs2_i, stream=self.queue) / pr_sz
+
+        obsh = [np.int32(ax) for ax in ob.shape]
+        prsh = [np.int32(ax) for ax in pr.shape]
+        exsh = [np.int32(ax) for ax in ex.shape]
+
+        # atomics version only
+        if addr.shape[3] != 3 or addr.shape[2] != 5:
+            raise ValueError('Address not in required shape for tiled ob_update')
+        num_pods = np.int32(addr.shape[0] * addr.shape[1])
+        bx = 64
+        by = 1
+        self.ob_update_wasp_cuda(ex, aux,
+            exsh[0], exsh[1], exsh[2],
+            pr,
+            pr_abs2,
+            prsh[0], prsh[1], prsh[2],
+            ob,
+            ob_sum_nmr,
+            ob_sum_dnm,
+            obsh[0], obsh[1], obsh[2],
+            addr,
+            pr_abs2_mean,
+            np.float32(alpha),
+            block=(bx, by, 1),
+            grid=(1, int((exsh[1] + by - 1)//by), int(num_pods)),
+            stream=self.queue)
+
+    def pr_update_wasp(self, addr, pr, ob, ex, aux, pr_sum_nmr, pr_sum_dnm,
+                       beta=1):
+        """Launch the WASP probe-update kernel, which adds atomically into pr and both sums."""
+        obsh = [np.int32(ax) for ax in ob.shape]
+        prsh = [np.int32(ax) for ax in pr.shape]
+        exsh = [np.int32(ax) for ax in ex.shape]
+
+        # atomics version only; the message names the probe update, not ob_update
+        if addr.shape[3] != 3 or addr.shape[2] != 5:
+            raise ValueError('Address not in required shape for tiled pr_update')
+        num_pods = np.int32(addr.shape[0] * addr.shape[1])
+        bx = 64
+        by = 1
+        self.pr_update_wasp_cuda(ex, aux,
+            exsh[0], exsh[1], exsh[2],
+            pr,
+            pr_sum_nmr,
+            pr_sum_dnm,
+            prsh[0], prsh[1], prsh[2],
+            ob,
+            obsh[0], obsh[1], obsh[2],
+            addr,
+            np.float32(beta),
+            block=(bx, by, 1),
+            grid=(1, int((exsh[1] + by - 1)//by), int(num_pods)),
+            stream=self.queue)
+
+    def avg_wasp(self, arr, nmr, dnm):
+        arrsh = [np.int32(ax) for ax in arr.shape]  # dimensions are handed over as int32
+        bx = 64  # block size along x
+        by = 1   # block size along y
+        self.avg_wasp_cuda(arr, nmr, dnm,
+            arrsh[0], arrsh[1], arrsh[2],
+            block=(bx, by, 1),
+            grid=(1, int((arrsh[1] + by - 1)//by), int(arrsh[0])),  # y: rows, z: leading index
+            stream=self.queue)
+
 
 class PositionCorrectionKernel(ab.PositionCorrectionKernel):
     from ptypy.accelerate.cuda_pycuda import address_manglers
@@ -1133,7 +1224,7 @@ def __init__(self, *args, queue_thread=None, math_type='float', accumulate_type=
             raise ValueError('Only float or double math is supported')
         if accumulate_type not in ['float', 'double']:
             raise ValueError('Only float or double math is supported')
-        
+
         # add kernels
         self.math_type = math_type
         self.accumulate_type = accumulate_type
diff --git a/ptypy/custom/WASP.py b/ptypy/custom/WASP.py
new file mode 100644
index 000000000..275224b9a
--- /dev/null
+++ b/ptypy/custom/WASP.py
@@ -0,0 +1,377 @@
+"""
+An implementation of the Weighted Average of Sequential Projections (WASP)
+ptychographic algorithm
+
+Authors: Andy Maiden
+"""
+import time
+
+import numpy as np
+
+from ..engines import base, projectional, register
+from ..engines.utils import projection_update_generalized, log_likelihood
+from ..core import geometry
+from ..core.manager import Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull
+from ..utils import Param
+from ..utils.verbose import logger, log
+from ..utils import parallel
+from .. import io
+from .. import utils as u
+
+__all__ = ['WASP']
+
+
+@register()
+class WASP(base.PositionCorrectionEngine):
+    """
+    Weighted Average of Sequential Projections
+
+    Defaults:
+
+    [name]
+    default = WASP
+    type = str
+    help =
+    doc =
+
+    [probe_update_start]
+    default = 2
+    type = int
+    lowlim = 0
+    help = Number of iterations before probe update starts
+
+    [subpix_start]
+    default = 0
+    type = int
+    lowlim = 0
+    help = Number of iterations before starting subpixel interpolation
+
+    [subpix]
+    default = 'linear'
+    type = str
+    help = Subpixel interpolation; 'fourier','linear' or None for no interpolation
+    choices = ['fourier','linear',None]
+
+    [update_object_first]
+    default = True
+    type = bool
+    help = If True update object before probe
+
+    [fourier_power_bound]
+    default = None
+    type = float
+    help = If rms error of model vs diffraction data is smaller than this value, Fourier constraint is met
+    doc = For Poisson-sampled data, the theoretical value for this parameter is 1/4. Set this value higher for noisy data. By default, power bound is calculated using fourier_relax_factor
+
+    [fourier_relax_factor]
+    default = 0.05
+    type = float
+    lowlim = 0.0
+    help = A factor used to calculate the Fourier power bound as 0.25 * fourier_relax_factor**2 * maximum power in diffraction data
+    doc = Set this value higher for noisy data.
+
+    [obj_smooth_std]
+    default = None
+    type = float
+    lowlim = 0
+    help = Gaussian smoothing (pixel) of the current object prior to update
+    doc = If None, smoothing is deactivated. This smoothing can be used to reduce the amplitude of spurious pixels in the outer, least constrained areas of the object.
+
+    [clip_object]
+    default = None
+    type = tuple
+    help = Clip object amplitude into this interval
+
+    [probe_center_tol]
+    default = None
+    type = float
+    lowlim = 0.0
+    help = Pixel radius around optical axes that the probe mass center must reside in
+
+    [compute_log_likelihood]
+    default = True
+    type = bool
+    help = A switch for computing the log-likelihood error (this can impact the performance of the engine)
+
+    [alpha]
+    default = 1.
+    type = float
+    lowlim = 0.0
+    help = object step parameter
+
+    [beta]
+    default = 1.
+    type = float
+    lowlim = 0.0
+    help = probe step parameter
+
+    [random_seed]
+    default = None
+    type = int
+    lowlim = 0
+    help = the seed to the random number generator for shuffling views
+    """
+
+    SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull]
+
+    def __init__(self, ptycho_parent, pars=None):
+        """Initialise via the parent constructor and register the WASP article with ptycho."""
+        super().__init__(ptycho_parent, pars)
+        self._a = 0.
+        self._b = 1.
+        self._c = 1.
+        # NOTE(review): volume=42, page=42 and doi='doi' look like placeholders; confirm
+        self.article = dict(
+                title='WASP: Weighted Average of Sequential Projections for ptychographic phase retrieval',
+                author='A. M. Maiden, W. Mei and P. Li',
+                journal='Optica',
+                volume=42,
+                year=2024,
+                page=42,
+                doi='doi',
+                comment='Weighted Average of Sequential Projections',
+                )
+        self.ptycho.citations.add_article(**self.article)
+
+    def engine_initialize(self):
+        """Reset the error list and create the zero-filled numerator/denominator sum containers."""
+        super().engine_initialize()
+        self.error = []
+
+        # these are the sums used for averaging the global object/probe;
+        # each 'successive projection' adds to them
+        # nmr and dnm stand for numerator and denominator respectively
+        self.ob_sum_nmr = self.ob.copy(self.ob.ID + '_ob_sum_nmr', fill=0.)
+        self.ob_sum_dnm = self.ob.copy(self.ob.ID + '_ob_sum_dnm', fill=0., dtype='real')
+        self.pr_sum_nmr = self.pr.copy(self.pr.ID + '_pr_sum_nmr', fill=0.)
+        self.pr_sum_dnm = self.pr.copy(self.pr.ID + '_pr_sum_dnm', fill=0., dtype='real')
+
+    def engine_prepare(self):
+        """Create the shuffle RNG once and, on new data, recompute power bounds and mean power."""
+        # NOTE: a large part of this is copied from _ProjectionEngine
+        # create RNG only once: this hook may run repeatedly (cf. the new_data check)
+        # and re-seeding would replay the same shuffles for a fixed random_seed
+        if getattr(self, 'rng', None) is None:
+            self.rng = np.random.default_rng(self.p.random_seed)
+
+        if self.ptycho.new_data:
+            # recalculate everything
+            mean_power = 0.
+            self.pbound_scan = {}
+            for s in self.di.storages.values():
+                if self.p.fourier_power_bound is None:
+                    pb = .25 * self.p.fourier_relax_factor**2 * s.pbound_stub
+                else:
+                    pb = self.p.fourier_power_bound
+                log(4, "power bound for scan %s = %f" %(s.label, pb))
+                if not self.pbound_scan.get(s.label):
+                    self.pbound_scan[s.label] = pb
+                else:
+                    self.pbound_scan[s.label] = max(pb, self.pbound_scan[s.label])
+                mean_power += s.mean_power
+            self.mean_power = mean_power / len(self.di.storages)
+
+    def engine_iterate(self, num=1):
+        """
+        Compute `num` iterations and return the error dict of the last one.
+        """
+        to = tp = 0.
+        error_dct = {}  # stays empty (instead of being unbound) when num == 0
+        for it in range(num):
+            t1 = time.time()
+
+            # Overlap update
+            error_dct = self.overlap_update()
+
+            # Recenter the probe
+            self.center_probe()
+
+            t2 = time.time()
+            to += t2 - t1
+
+            # Position update
+            self.position_update()
+
+            t3 = time.time()
+            tp += t3 - t2
+
+            # count up
+            self.curiter += 1
+
+        logger.info('Time spent in Overlap update: %.2f' % to)
+        logger.info('Time spent in Position update: %.2f' % tp)
+
+        return error_dct
+
+    def overlap_update(self):
+        """One shuffled pass over the active views; returns errors by view."""
+        vieworder = list(self.di.views.keys())
+        # the sorting is important to ensure they are the same input to RNG in
+        # every iteration
+        vieworder.sort()
+
+        # reset the accumulated sum of object/probe before going through all
+        # the diffraction view for this iteration
+        self.ob_sum_nmr.fill(0.)
+        self.ob_sum_dnm.fill(0.)
+        self.pr_sum_nmr.fill(0.)
+        self.pr_sum_dnm.fill(0.)
+
+        self.rng.shuffle(vieworder)
+
+        error_dct = {}
+        for name in vieworder:
+            view = self.di.views[name]
+            if not view.active:
+                continue
+
+            # Old exit wave/object per pod; `pname` must not rebind view `name`
+            ex_old = {}
+            ob_old = {}
+            for pname, pod in view.pods.items():
+                ex_old[pname] = pod.object * pod.probe
+                ob_old[pname] = pod.object.copy()
+
+            error_dct[name] = self.fourier_update(view)
+
+            # update object first, then probe, and accumulate their sum for
+            # averaging after going through all the views
+            self.object_update(view, ex_old)
+            self.probe_update(view, ex_old, ob_old)
+
+        # WASP
+        self.wasp_averaging()
+
+        return error_dct
+
+    def engine_finalize(self):
+        """
+        Finalize the parent engine, then unregister and drop the four
+        helper containers used for the WASP sums.
+        """
+        super().engine_finalize()
+
+        # attribute names of the helper containers
+        helper_names = ('ob_sum_nmr', 'ob_sum_dnm', 'pr_sum_nmr', 'pr_sum_dnm')
+
+        # remove helper containers
+        for helper in [getattr(self, attr) for attr in helper_names]:
+            logger.debug('Attempt to remove container %s' % helper.ID)
+            del self.ptycho.containers[helper.ID]
+
+        # drop this engine's own references
+        for attr in helper_names:
+            delattr(self, attr)
+
+    def fourier_update(self, view):
+        """
+        Fourier update of a single view (copied from stochastic).
+
+        Parameters
+        ----------
+        view : View
+            View to diffraction data
+
+        Returns
+        -------
+        numpy.ndarray
+            [err_fmag, err_phot, err_exit]; err_phot is 0. when
+            compute_log_likelihood is off.
+        """
+        err_fmag, err_exit = projection_update_generalized(
+            view, a=self._a, b=self._b, c=self._c)
+        err_phot = log_likelihood(view) if self.p.compute_log_likelihood else 0.
+        return np.array([err_fmag, err_phot, err_exit])
+
+    def object_update(self, view, ex_old):
+        """Add the view's terms to the object sums, then update pod.object."""
+        for name, pod in view.pods.items():
+            pr_conj = np.conj(pod.probe)
+            pr_abs2 = u.abs2(pod.probe)
+            # contribution to the sums that wasp_averaging() divides later
+            self.ob_sum_nmr[pod.ob_view] += pr_conj * pod.exit
+            self.ob_sum_dnm[pod.ob_view] += pr_abs2
+            # local update; alpha scales the mean of pr_abs2 in the denominator
+            probe_norm = np.mean(pr_abs2)*self.p.alpha + pr_abs2
+            pod.object += 0.5*pr_conj*(pod.exit - ex_old[name]) / probe_norm
+
+    def probe_update(self, view, ex_old, ob_old):
+        """Add the view's terms to the probe sums, then update pod.probe."""
+        for name, pod in view.pods.items():
+            # it is important to use ob_old, but not the updated pod.object
+            ob_conj = np.conj(ob_old[name])
+            ob_abs2 = u.abs2(ob_old[name])
+            # contribution to the sums that wasp_averaging() divides later
+            self.pr_sum_nmr[pod.pr_view] += ob_conj * pod.exit
+            self.pr_sum_dnm[pod.pr_view] += ob_abs2
+            # local update; beta is added to ob_abs2 in the denominator
+            object_norm = self.p.beta + ob_abs2
+            pod.probe += ob_conj*(pod.exit - ex_old[name]) / object_norm
+
+    def wasp_averaging(self):
+        """Set object and probe data to their reduced numerator/denominator."""
+        for name, s in self.ob.storages.items():
+            ob_sum_nmr = self.ob_sum_nmr.storages[name].data
+            ob_sum_dnm = self.ob_sum_dnm.storages[name].data
+
+            parallel.allreduce(ob_sum_nmr)
+            parallel.allreduce(ob_sum_dnm)
+
+            # where the denominator is ~0 keep the numerator, not the quotient
+            is_zero = np.isclose(ob_sum_dnm, 0)
+            s.data = np.where(is_zero, ob_sum_nmr, ob_sum_nmr / ob_sum_dnm)
+
+            self.clip_object(s)
+
+        for name, p in self.pr.storages.items():
+            pr_sum_nmr = self.pr_sum_nmr.storages[name].data
+            pr_sum_dnm = self.pr_sum_dnm.storages[name].data
+
+            parallel.allreduce(pr_sum_nmr)
+            parallel.allreduce(pr_sum_dnm)
+
+            # where the denominator is ~0 keep the numerator, not the quotient
+            is_zero = np.isclose(pr_sum_dnm, 0)
+            p.data = np.where(is_zero, pr_sum_nmr, pr_sum_nmr / pr_sum_dnm)
+
+    def clip_object(self, ob):
+        """Clamp the magnitudes of ob.data into the range self.p.clip_object,
+        keeping the phases (copied from _ProjectionEngine)."""
+        limits = self.p.clip_object
+        if limits is None:
+            return
+        # This call takes like one ms. Not time critical
+        lower, upper = limits
+        magnitude = np.abs(ob.data)
+        unit_phasor = np.exp(1j * np.angle(ob.data))
+        above, below = magnitude > upper, magnitude < lower
+        ob.data[above] = upper * unit_phasor[above]
+        ob.data[below] = lower * unit_phasor[below]
+
+    def center_probe(self):
+        """Shift probe, object and exit waves so that the probe's centre of
+        mass moves to the array centre (copied from _ProjectionEngine)."""
+
+        if self.p.probe_center_tol is not None:
+            for name, pr_s in self.pr.storages.items():
+                c1 = u.mass_center(u.abs2(pr_s.data).sum(0))
+                c2 = np.asarray(pr_s.shape[-2:]) // 2
+                # fft convention should however use geometry instead
+                if u.norm(c1 - c2) < self.p.probe_center_tol:
+                    break  # NOTE(review): also skips the remaining storages
+                # SC: possible BUG here, wrong input parameter
+                pr_s.data[:] = u.shift_zoom(pr_s.data, (1.,)*3,
+                        (0, c1[0], c1[1]), (0, c2[0], c2[1]))
+
+                # shift the object
+                ob_s = pr_s.views[0].pod.ob_view.storage
+                ob_s.data[:] = u.shift_zoom(ob_s.data, (1.,)*3,
+                        (0, c1[0], c1[1]), (0, c2[0], c2[1]))
+
+                # shift the exit waves, loop through different exit wave views
+                for pv in pr_s.views:
+                    pv.pod.exit = u.shift_zoom(pv.pod.exit, (1.,)*2,
+                            (c1[0], c1[1]), (c2[0], c2[1]))
+
+                log(4,'Probe recentered from %s to %s'
+                            % (str(tuple(c1)), str(tuple(c2))))
diff --git a/ptypy/custom/WASP_cupy.py b/ptypy/custom/WASP_cupy.py
new file mode 100644
index 000000000..06d1507fb
--- /dev/null
+++ b/ptypy/custom/WASP_cupy.py
@@ -0,0 +1,542 @@
+import time
+
+import numpy as np
+import cupy as cp
+import cupyx
+
+from ..engines import register
+from .WASP_serial import WASP_serial
+from ..utils.verbose import logger, log
+from ..utils import parallel
+from .. import utils as u
+
+from ..accelerate.cuda_cupy import get_context
+from ..accelerate.cuda_cupy.kernels import (FourierUpdateKernel,
+    AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel,
+    PropagationKernel)
+from ..accelerate.cuda_cupy.array_utils import (ArrayUtilsKernel,
+    GaussianSmoothingKernel, TransposeKernel, ClipMagnitudesKernel,
+    MaxAbs2Kernel, MassCenterKernel, Abs2SumKernel, InterpolatedShiftKernel)
+from ..accelerate.cuda_cupy.mem_utils import make_pagelocked_paired_arrays as mppa
+from ..accelerate.cuda_cupy.mem_utils import GpuDataManager
+from ..accelerate.cuda_cupy.multi_gpu import get_multi_gpu_communicator
+
+
+__all__ = ['WASP_cupy']
+
+EX_MA_BLOCKS_RATIO = 2
+# can be used to limit the number of blocks, simulating that they don't fit
+MAX_BLOCKS = 99999
+# MAX_BLOCKS = 10  # can be used to limit the number of blocks, simulating that they don't fit
+
+@register()
+class WASP_cupy(WASP_serial):
+    """
+    Weighted Average of Sequential Projections
+
+    Defaults:
+
+    [name]
+    default = WASP_cupy
+    type = str
+    help =
+    doc =
+
+    [fft_lib]
+    default = reikna
+    type = str
+    help = Choose the cupy-compatible FFT module.
+    doc = One of:
+      - ``'reikna'`` : the reikna packaga (fast load, competitive compute for streaming)
+      - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU)
+      - ``'skcuda'`` : scikit-cuda (fast load, slowest compute due to additional store/load stages)
+    choices = 'reikna','cuda','skcuda'
+    userlevel = 2
+    """
+
+    def __init__(self, ptycho_parent, pars=None):
+        """
+        Set up the WASP engine; GPU data managers and communicator start as None.
+        """
+        super().__init__(ptycho_parent, pars)
+        self.ma_data = None
+        self.mag_data = None
+        self.ex_data = None
+        self.multigpu = None
+
+    def engine_initialize(self):
+        """
+        Prepare for reconstruction: context, communicator, kernels, streams.
+        """
+        # Context, Multi GPU communicator and Stream (needs to be in this order)
+        self.queue = get_context(new_queue=False)
+        self.multigpu = get_multi_gpu_communicator()
+
+        # initialise kernels for centring probe if required
+        if self.p.probe_center_tol is not None:
+            # mass center kernel
+            self.MCK = MassCenterKernel(queue=self.queue)
+            # absolute sum kernel
+            self.A2SK = Abs2SumKernel(dtype=self.pr.dtype, queue=self.queue)
+            # interpolated shift kernel
+            self.ISK = InterpolatedShiftKernel(queue=self.queue)
+
+        # Clip Magnitudes Kernel
+        self.CMK = ClipMagnitudesKernel(queue=self.queue)
+
+        super().engine_initialize()
+        self.qu_htod = cp.cuda.Stream()  # passed to the to_gpu() transfers
+        self.qu_dtoh = cp.cuda.Stream()  # passed to the from_gpu() transfers
+
+    def _setup_kernels(self):
+        """
+        Setup kernels, one set per scan, and size the GPU data managers.
+        """
+        fpc = 0
+
+        # get the scans
+        for label, scan in self.ptycho.model.scans.items():
+
+            kern = u.Param()
+            kern.scanmodel = type(scan).__name__
+            self.kernels[label] = kern
+            # TODO: needs to be adapted for broad bandwidth
+            geo = scan.geometries[0]
+
+            # Get info to shape buffer arrays
+            fpc = max(scan.max_frames_per_block, fpc)
+
+            # TODO : make this more foolproof
+            try:
+                nmodes = scan.p.coherence.num_probe_modes * \
+                         scan.p.coherence.num_object_modes
+            except Exception:  # not bare: let KeyboardInterrupt/SystemExit pass
+                nmodes = 1
+
+            # create buffer arrays
+            ash = (nmodes,) + tuple(geo.shape)
+            aux = np.zeros(ash, dtype=np.complex64)
+            kern.aux = cp.asarray(aux)
+
+            # setup kernels, one for each SCAN.
+            log(4, "Setting up FourierUpdateKernel")
+            kern.FUK = FourierUpdateKernel(aux, nmodes, queue_thread=self.queue)
+            kern.FUK.fshape = (1,) + kern.FUK.fshape[1:]
+            kern.FUK.allocate()
+
+            log(4, "Setting up PoUpdateKernel")
+            kern.POK = PoUpdateKernel(queue_thread=self.queue)
+            kern.POK.allocate()
+
+            log(4, "Setting up AuxiliaryWaveKernel")
+            kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue)
+            kern.AWK.allocate()
+
+            log(4, "Setting up ArrayUtilsKernel")
+            kern.AUK = ArrayUtilsKernel(queue=self.queue)
+
+            #log(4, "Setting up TransposeKernel")
+            #kern.TK = TransposeKernel(queue=self.queue)
+
+            log(4, "setting up MaxAbs2Kernel")
+            kern.MAK = MaxAbs2Kernel(queue=self.queue)
+
+            log(4, "Setting up PropagationKernel")
+            kern.PROP = PropagationKernel(aux, geo.propagator, self.queue, self.p.fft_lib)
+            kern.PROP.allocate()
+            kern.resolution = geo.resolution[0]
+
+            if self.do_position_refinement:
+                log(4, "Setting up position correction")
+                kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue)
+                kern.PCK.allocate()
+
+        ex_mem = 0
+        mag_mem = 0
+        for scan, kern in self.kernels.items():
+            if kern.scanmodel in ("GradFull", "BlockGradFull"):
+                ex_mem = max(kern.aux.nbytes * 1, ex_mem)
+            else:
+                ex_mem = max(kern.aux.nbytes * fpc, ex_mem)
+            mag_mem = max(kern.FUK.gpu.fdev.nbytes * fpc, mag_mem)
+        ma_mem = mag_mem
+        mem = cp.cuda.runtime.memGetInfo()[0]
+        blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem
+        # leave 200MB room for safety
+        fit = int(mem - 200 * 1024 * 1024) // blk
+        if fit < 1:  # also catches a negative fit (free memory < 200MB)
+            log(1, "Cannot fit memory into device, if possible reduce frames per block. Exiting...")
+            raise SystemExit("ptypy has been exited.")
+
+        # TODO grow blocks dynamically
+        nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS)
+        nma = min(fit, MAX_BLOCKS)
+
+        log(3, 'cupy max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma))
+        # reset memory or create new
+        self.ex_data = GpuDataManager(ex_mem, 0, nex, True)
+        self.ma_data = GpuDataManager(ma_mem, 0, nma, False)
+        self.mag_data = GpuDataManager(mag_mem, 0, nma, False)
+        log(4, "Kernel setup completed")
+
+    def engine_prepare(self):
+        super().engine_prepare()
+        # mppa yields a (device, host) array pair for each object/probe storage
+        for name, s in self.ob.S.items():
+            s.gpu, s.data = mppa(s.data)
+        for name, s in self.pr.S.items():
+            s.gpu, s.data = mppa(s.data)
+        # device copies of the address arrays of every diffraction storage
+        for label, d in self.di.storages.items():
+            prep = self.diff_info[d.ID]
+            prep.addr_gpu = cp.asarray(prep.addr)
+            if self.do_position_refinement:
+                prep.mangled_addr_gpu = prep.addr_gpu.copy()
+        # device and pinned buffers, only for the entries of ptycho.new_data
+        for label, d in self.ptycho.new_data:
+            dID = d.ID
+            prep = self.diff_info[dID]
+            pID, oID, eID = prep.poe_IDs
+
+            prep.ma_sum_gpu = cp.asarray(prep.ma_sum)
+            prep.err_fourier_gpu = cp.asarray(prep.err_fourier)
+            prep.err_phot_gpu = cp.asarray(prep.err_phot)
+            prep.err_exit_gpu = cp.asarray(prep.err_exit)
+            if self.do_position_refinement:
+                prep.error_state_gpu = cp.empty_like(prep.err_fourier_gpu)
+
+            # these are the sum for averaging the global object/probe
+            # they are added for each 'successive projection'
+            # nmr and dnm stand for numerator and denominator respectively
+            prep.ob_sum_nmr = cp.asarray(prep.ob_sum_nmr)
+            prep.ob_sum_dnm = cp.asarray(prep.ob_sum_dnm)
+            prep.pr_sum_nmr = cp.asarray(prep.pr_sum_nmr)
+            prep.pr_sum_dnm = cp.asarray(prep.pr_sum_dnm)
+
+            # prepare page-locked mems:
+            ma = self.ma.S[dID].data.astype(np.float32)
+            prep.ma = cupyx.empty_pinned(ma.shape, ma.dtype, order="C")
+            prep.ma[:] = ma
+            ex = self.ex.S[eID].data
+            prep.ex = cupyx.empty_pinned(ex.shape, ex.dtype, order="C")
+            prep.ex[:] = ex
+            mag = prep.mag
+            prep.mag = cupyx.empty_pinned(mag.shape, mag.dtype, order="C")
+            prep.mag[:] = mag
+
+            self.ex_data.add_data_block()
+            self.ma_data.add_data_block()
+            self.mag_data.add_data_block()
+
+    def engine_iterate(self, num=1):
+        """
+        Compute `num` iterations and return the per-view error dictionary.
+        """
+        self.dID_list = list(self.di.S.keys())
+        error = {}
+        for it in range(num):
+
+            for iblock, dID in enumerate(self.dID_list):
+
+                # find probe, object and exit ID in dependence of dID
+                prep = self.diff_info[dID]
+                pID, oID, eID = prep.poe_IDs
+
+                # references for kernels
+                kern = self.kernels[prep.label]
+                FUK = kern.FUK
+                AWK = kern.AWK
+                POK = kern.POK
+                MAK = kern.MAK
+                PROP = kern.PROP
+
+                # get aux buffer
+                aux = kern.aux
+
+                # local references
+                ob = self.ob.S[oID].gpu
+                pr = self.pr.S[pID].gpu
+
+                # the copy is important to prevent vieworder being modified,
+                # which is always sorted
+                vieworder_all = prep.view_IDs_all.copy()
+                prep.rng.shuffle(vieworder_all)
+
+                # reset the accumulated sum of object/probe before going
+                # through all the diffraction view for this iteration
+                ob_sum_nmr = prep.ob_sum_nmr
+                ob_sum_dnm = prep.ob_sum_dnm
+                pr_sum_nmr = prep.pr_sum_nmr
+                pr_sum_dnm = prep.pr_sum_dnm
+                ob_sum_nmr.fill(0)
+                ob_sum_dnm.fill(0)
+                pr_sum_nmr.fill(0)
+                pr_sum_dnm.fill(0)
+
+                # Schedule ex, ma, mag to device
+                ev_ex, ex_full, data_ex = self.ex_data.to_gpu(prep.ex, dID, self.qu_htod)
+                ev_mag, mag_full, data_mag = self.mag_data.to_gpu(prep.mag, dID, self.qu_htod)
+                ev_ma, ma_full, data_ma = self.ma_data.to_gpu(prep.ma, dID, self.qu_htod)
+
+                # Reference to ex, ma and mag
+                prep.ex_full = ex_full
+                prep.mag_full = mag_full
+                prep.ma_full = ma_full
+
+                # synchronize h2d stream with compute stream
+                self.queue.wait_event(ev_ex)
+
+                # Iterate through views
+                for vname in vieworder_all:
+                    # only proceed for active view, which is in prep.view_IDs
+                    # for this particular rank
+                    if vname not in prep.view_IDs:
+                        continue
+
+                    # Get local address and arrays
+                    i = prep.view_IDs.index(vname)
+                    addr = prep.addr_gpu[i,None]
+                    ex_from, ex_to = prep.addr_ex[i]
+                    ex = prep.ex_full[ex_from:ex_to]
+                    mag = prep.mag_full[i,None]
+                    ma = prep.ma_full[i,None]
+                    ma_sum = prep.ma_sum_gpu[i,None]
+                    err_phot = prep.err_phot_gpu[i,None]
+                    err_fourier = prep.err_fourier_gpu[i,None]
+                    err_exit = prep.err_exit_gpu[i,None]
+
+                    ## build auxiliary wave
+                    AWK.make_aux(aux, addr, ob, pr, ex, c_po=self._c, c_e=1-self._c)
+
+                    ## forward FFT
+                    PROP.fw(aux, aux)
+
+                    ## Deviation from measured data
+                    self.queue.wait_event(ev_mag)
+                    if self.p.compute_fourier_error:
+                        self.queue.wait_event(ev_ma)
+                        FUK.fourier_error(aux, addr, mag, ma, ma_sum)
+                        FUK.error_reduce(addr, err_fourier)
+                    else:
+                        FUK.fourier_deviation(aux, addr, mag)
+                        self.queue.wait_event(ev_ma)
+                    FUK.fmag_update_nopbound(aux, addr, mag, ma)
+
+                    ## backward FFT
+                    PROP.bw(aux, aux)
+
+                    ## build exit wave
+                    AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b, c_po=self._a, c_e=-(self._a + self._b))
+                    if self.p.compute_exit_error:
+                        FUK.exit_error(aux,addr)
+                        FUK.error_reduce(addr, err_exit)
+
+                    ## build auxiliary wave (ob * pr product)
+                    AWK.build_aux2_no_ex(aux, addr, ob, pr)
+
+                    # WASP ob and pr local update
+                    ob_old = ob.copy()
+                    POK.ob_update_wasp(addr, ob, pr, ex, aux, ob_sum_nmr,
+                                       ob_sum_dnm, alpha=self.p.alpha)
+                    POK.pr_update_wasp(addr, pr, ob_old, ex, aux, pr_sum_nmr,
+                                       pr_sum_dnm, beta=self.p.beta)
+
+                    ## compute log-likelihood
+                    if self.p.compute_log_likelihood:
+                        PROP.fw(aux, aux)
+                        FUK.log_likelihood2(aux, addr, mag, ma, err_phot)
+
+                # WASP averaging
+
+                # collect the sums
+                self.multigpu.allReduceSum(ob_sum_nmr)
+                self.multigpu.allReduceSum(ob_sum_dnm)
+                self.multigpu.allReduceSum(pr_sum_nmr)
+                self.multigpu.allReduceSum(pr_sum_dnm)
+
+                POK.avg_wasp(ob, ob_sum_nmr, ob_sum_dnm)
+                POK.avg_wasp(pr, pr_sum_nmr, pr_sum_dnm)
+
+                # Clip object
+                if self.p.clip_object is not None:
+                    self.clip_object(ob)
+                    self.queue.synchronize()
+
+                data_ex.record_done(self.queue, 'compute')
+                if iblock + len(self.ex_data) < len(self.dID_list):
+                    data_ex.from_gpu(self.qu_dtoh)
+
+            # swap direction
+            self.dID_list.reverse()
+
+            # Re-center probe
+            self.center_probe()
+
+            # position update
+            self.position_update()
+
+            self.curiter += 1
+            self.ex_data.syncback = False
+
+        # finish all the compute
+        self.queue.synchronize()
+
+        for name, s in self.ob.S.items():
+            cp.asnumpy(s.gpu, stream=self.queue, out=s.data)
+        for name, s in self.pr.S.items():
+            cp.asnumpy(s.gpu, stream=self.queue, out=s.data)
+
+        for dID, prep in self.diff_info.items():
+            err_fourier = prep.err_fourier_gpu.get()
+            err_phot = prep.err_phot_gpu.get()
+            err_exit = prep.err_exit_gpu.get()
+            errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T)
+            error.update(zip(prep.view_IDs, errs))
+
+        # wait for the async transfers
+        self.qu_dtoh.synchronize()
+
+        self.error = error
+        return error
+
+    def clip_object(self, ob):
+        """
+        Clip magnitudes of ob into the range self.p.clip_object (not None here).
+        """
+        cmin, cmax = self.p.clip_object
+        self.CMK.clip_magnitudes_to_range(ob, cmin, cmax)
+
+    def position_update(self):
+        """
+        Position refinement, run only inside the configured iteration window
+        (Copied from _ProjectionEngine_cupy)
+        """
+        if not self.do_position_refinement or (not self.curiter):
+            return
+        do_update_pos = (self.p.position_refinement.stop > self.curiter >= self.p.position_refinement.start)
+        do_update_pos &= (self.curiter % self.p.position_refinement.interval) == 0
+        use_tiles = (not self.p.probe_update_cuda_atomics) or ( not self.p.object_update_cuda_atomics)
+
+        # Update positions
+        if do_update_pos:
+            self.queue.use()
+            """
+            Iterates through all positions and refines them by a given algorithm.
+            """
+            log(4, "----------- START POS REF -------------")
+            for dID in self.di.S.keys():
+
+                prep = self.diff_info[dID]
+                pID, oID, eID = prep.poe_IDs
+                ma = self.ma.S[dID].gpu  # NOTE(review): no `.gpu` is set on mask storages in this class - confirm
+                ob = self.ob.S[oID].gpu
+                pr = self.pr.S[pID].gpu
+                kern = self.kernels[prep.label]
+                aux = kern.aux
+                addr = prep.addr_gpu
+                original_addr = prep.original_addr
+                mangled_addr = prep.mangled_addr_gpu
+                mag = prep.mag  # NOTE(review): pinned host array after engine_prepare - confirm
+                ma_sum = prep.ma_sum  # NOTE(review): host array; the device copy is prep.ma_sum_gpu
+                err_fourier = prep.err_fourier_gpu
+                error_state = prep.error_state_gpu
+
+                PCK = kern.PCK
+                TK = kern.TK  # NOTE(review): _setup_kernels of this class leaves kern.TK commented out
+                PROP = kern.PROP
+
+                # Keep track of object boundaries
+                max_oby = ob.shape[-2] - aux.shape[-2] - 1
+                max_obx = ob.shape[-1] - aux.shape[-1] - 1
+
+                # We need to re-calculate the current error
+                PCK.build_aux(aux, addr, ob, pr)
+                PROP.fw(aux, aux)
+                if self.p.position_refinement.metric == "fourier":
+                    PCK.fourier_error(aux, addr, mag, ma, ma_sum)
+                    PCK.error_reduce(addr, err_fourier)
+                if self.p.position_refinement.metric == "photon":
+                    PCK.log_likelihood(aux, addr, mag, ma, err_fourier)
+                cp.cuda.runtime.memcpyAsync(dst=error_state.data.ptr,
+                                            src=err_fourier.data.ptr,
+                                            size=err_fourier.nbytes,
+                                            kind=3,  # device to device
+                                            stream=self.queue.ptr)
+
+                PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0])
+
+                log(4, 'Position refinement trial: iteration %s' % (self.curiter))
+                for i in range(PCK.mangler.nshifts):
+                    PCK.mangler.get_address( i, addr, mangled_addr, max_oby, max_obx)
+                    PCK.build_aux(aux, mangled_addr, ob, pr)
+                    PROP.fw(aux, aux)
+                    if self.p.position_refinement.metric == "fourier":
+                        PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum)
+                        PCK.error_reduce(mangled_addr, err_fourier)
+                    if self.p.position_refinement.metric == "photon":
+                        PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier)
+                    PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier)
+
+                cp.cuda.runtime.memcpyAsync(dst=err_fourier.data.ptr,
+                                            src=error_state.data.ptr,
+                                            size=err_fourier.nbytes,
+                                            kind=3,
+                                            stream=self.queue.ptr)  # d2d
+                if use_tiles:
+                    s1 = addr.shape[0] * addr.shape[1]
+                    s2 = addr.shape[2] * addr.shape[3]
+                    TK.transpose(addr.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1))  # NOTE(review): prep.addr2_gpu is not set in this class
+
+    def center_probe(self):
+        if self.p.probe_center_tol is not None:
+            for name, pr_s in self.pr.storages.items():
+                psum_d = self.A2SK.abs2sum(pr_s.gpu)
+                c1 = self.MCK.mass_center(psum_d).get()
+                c2 = (np.asarray(pr_s.shape[-2:]) // 2).astype(c1.dtype)
+                # c1: mass centre from the kernel, c2: centre of the last two axes
+                shift = c2 - c1
+                # leave the loop (remaining storages included) once within tolerance
+                if u.norm(shift) < self.p.probe_center_tol:
+                    break
+
+                # shift the probe
+                pr_s.gpu = self.ISK.interpolate_shift(pr_s.gpu, shift)
+
+                # shift the object
+                ob_s = pr_s.views[0].pod.ob_view.storage
+                ob_s.gpu = self.ISK.interpolate_shift(ob_s.gpu, shift)
+
+                # shift the exit waves of every block that uses this probe
+                for dID in self.di.S.keys():
+                    prep = self.diff_info[dID]
+                    pID, oID, eID = prep.poe_IDs
+                    if pID == name:
+                        prep.ex_full = self.ISK.interpolate_shift(prep.ex_full,
+                                                                  shift)
+
+                log(4, 'Probe recentered from %s to %s'
+                    % (str(tuple(c1)), str(tuple(c2))))
+
+    def engine_finalize(self):
+        """
+        Clear GPU data, fetch addresses to host, then finalize the parent.
+        """
+        self.ex_data = None
+        self.ma_data = None
+        self.mag_data = None
+        # drop the device arrays attached to the object and probe storages
+        for name, s in self.ob.S.items():
+            del s.gpu
+        for name, s in self.pr.S.items():
+            del s.gpu
+        for dID, prep in self.diff_info.items():
+            prep.addr = prep.addr_gpu.get()
+
+        # copy data to cpu
+        # this kills the pagelock memory (otherwise we get segfaults in h5py)
+        for name, s in self.pr.S.items():
+            s.data = np.copy(s.data)
+        for name, s in self.ob.S.items():
+            s.data = np.copy(s.data)
+
+        super().engine_finalize()
diff --git a/ptypy/custom/WASP_pycuda.py b/ptypy/custom/WASP_pycuda.py
new file mode 100644
index 000000000..f68b48434
--- /dev/null
+++ b/ptypy/custom/WASP_pycuda.py
@@ -0,0 +1,538 @@
+import time
+
+import numpy as np
+from pycuda import gpuarray
+import pycuda.driver as cuda
+
+from ..engines import register
+from .WASP_serial import WASP_serial
+from ..utils.verbose import logger, log
+from ..utils import parallel
+from .. import utils as u
+
+from ..accelerate.base.engines import projectional_serial
+from ..accelerate.cuda_pycuda import get_context
+from ..accelerate.cuda_pycuda.kernels import (FourierUpdateKernel,
+    AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel,
+    PropagationKernel, RealSupportKernel, FourierSupportKernel)
+from ..accelerate.cuda_pycuda.array_utils import (ArrayUtilsKernel,
+    GaussianSmoothingKernel, TransposeKernel, ClipMagnitudesKernel,
+    MaxAbs2Kernel, MassCenterKernel, Abs2SumKernel,InterpolatedShiftKernel)
+from ..accelerate.cuda_pycuda.mem_utils import make_pagelocked_paired_arrays as mppa
+from ..accelerate.cuda_pycuda.mem_utils import GpuDataManager
+from ..accelerate.cuda_pycuda.multi_gpu import get_multi_gpu_communicator
+
+
+__all__ = ['WASP_pycuda']
+
+EX_MA_BLOCKS_RATIO = 2  # exit-wave blocks requested on the GPU per mask/magnitude block
+MAX_BLOCKS = 99999  # can be used to limit the number of blocks, simulating that they don't fit
+#MAX_BLOCKS = 10  # can be used to limit the number of blocks, simulating that they don't fit
+
+@register()
+class WASP_pycuda(WASP_serial):
+    """
+    Weighted Average of Sequential Projections
+
+    Defaults:
+
+    [name]
+    default = WASP_pycuda
+    type = str
+    help =
+    doc =
+
+    [fft_lib]
+    default = reikna
+    type = str
+    help = Choose the pycuda-compatible FFT module.
+    doc = One of:
+      - ``'reikna'`` : the reikna packaga (fast load, competitive compute for streaming)
+      - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU)
+      - ``'skcuda'`` : scikit-cuda (fast load, slowest compute due to additional store/load stages)
+    choices = 'reikna','cuda','skcuda'
+    userlevel = 2
+    """
+
+    def __init__(self, ptycho_parent, pars=None):
+        """
+        Weighted Average of Sequential Projections (PyCUDA engine).
+        """
+        super().__init__(ptycho_parent, pars)
+        # placeholders; assigned in engine_initialize / _setup_kernels
+        self.multigpu = None
+        self.ex_data = None
+        self.ma_data = self.mag_data = None
+
+    def engine_initialize(self):
+        """
+        Set up the CUDA context, multi-GPU communicator, streams and shared kernels.
+        """
+        # Context, Multi GPU communicator and Stream (needs to be in this order)
+        self.context, self.queue = get_context(new_context=True, new_queue=False)
+        self.multigpu = get_multi_gpu_communicator()
+        self.context, self.queue = get_context(new_context=False, new_queue=True)
+
+        # initialise kernels for centring probe if required (used by center_probe)
+        if self.p.probe_center_tol is not None:
+            # mass center kernel
+            self.MCK = MassCenterKernel(queue=self.queue)
+            # absolute sum kernel
+            self.A2SK = Abs2SumKernel(dtype=self.pr.dtype, queue=self.queue)
+            # interpolated shift kernel
+            self.ISK = InterpolatedShiftKernel(queue=self.queue)
+
+        # Clip Magnitudes Kernel (used by clip_object)
+        self.CMK = ClipMagnitudesKernel(queue=self.queue)
+
+        super().engine_initialize()
+        self.qu_htod = cuda.Stream()  # stream passed to the host-to-device uploads
+        self.qu_dtoh = cuda.Stream()  # stream passed to the device-to-host downloads
+
+    def _setup_kernels(self):
+        """
+        Setup kernels, one for each scan. Derive scans from ptycho class.
+        Also sizes the GPU data managers; raises SystemExit if no block fits.
+        """
+        fpc = 0
+
+        # get the scans
+        for label, scan in self.ptycho.model.scans.items():
+            kern = u.Param()
+            kern.scanmodel = type(scan).__name__
+            self.kernels[label] = kern
+            # TODO: needs to be adapted for broad bandwidth
+            geo = scan.geometries[0]
+
+            # Get info to shape buffer arrays
+            fpc = max(scan.max_frames_per_block, fpc)
+
+            # TODO : make this more foolproof
+            try:
+                nmodes = scan.p.coherence.num_probe_modes * \
+                         scan.p.coherence.num_object_modes
+            except Exception:
+                nmodes = 1
+
+            # create buffer arrays
+            ash = (nmodes,) + tuple(geo.shape)
+            aux = np.zeros(ash, dtype=np.complex64)
+            kern.aux = gpuarray.to_gpu(aux)
+
+            # setup kernels, one for each SCAN.
+            log(4, "Setting up FourierUpdateKernel")
+            kern.FUK = FourierUpdateKernel(aux, nmodes, queue_thread=self.queue)
+            kern.FUK.fshape = (1,) + kern.FUK.fshape[1:]
+            kern.FUK.allocate()
+
+            log(4, "Setting up PoUpdateKernel")
+            kern.POK = PoUpdateKernel(queue_thread=self.queue)
+            kern.POK.allocate()
+
+            log(4, "Setting up AuxiliaryWaveKernel")
+            kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue)
+            kern.AWK.allocate()
+
+            log(4, "Setting up ArrayUtilsKernel")
+            kern.AUK = ArrayUtilsKernel(queue=self.queue)
+
+            #log(4, "Setting up TransposeKernel")
+            #kern.TK = TransposeKernel(queue=self.queue)
+
+            log(4, "setting up MaxAbs2Kernel")
+            kern.MAK = MaxAbs2Kernel(queue=self.queue)
+
+            log(4, "Setting up PropagationKernel")
+            kern.PROP = PropagationKernel(aux, geo.propagator, self.queue, self.p.fft_lib)
+            kern.PROP.allocate()
+            kern.resolution = geo.resolution[0]
+
+            if self.do_position_refinement:
+                log(4, "Setting up position correction")
+                kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue)
+                kern.PCK.allocate()
+
+        ex_mem = 0
+        mag_mem = 0
+        for scan, kern in self.kernels.items():
+            if kern.scanmodel in ("GradFull", "BlockGradFull"):
+                ex_mem = max(kern.aux.nbytes * 1, ex_mem)
+            else:
+                ex_mem = max(kern.aux.nbytes * fpc, ex_mem)
+            mag_mem = max(kern.FUK.gpu.fdev.nbytes * fpc, mag_mem)
+        ma_mem = mag_mem
+        mem = cuda.mem_get_info()[0]
+        blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem
+        fit = int(mem - 200 * 1024 * 1024) // blk  # leave 200MB room for safety
+        if fit <= 0:  # also negative when less than the 200MB reserve is free
+            log(1,"Cannot fit memory into device, if possible reduce frames per block. Exiting...")
+            raise SystemExit("ptypy has been exited.")
+
+        # TODO grow blocks dynamically
+        nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS)
+        nma = min(fit, MAX_BLOCKS)
+
+        log(3, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma))
+        # reset memory or create new
+        self.ex_data = GpuDataManager(ex_mem, 0, nex, True)
+        self.ma_data = GpuDataManager(ma_mem, 0, nma, False)
+        self.mag_data = GpuDataManager(mag_mem, 0, nma, False)
+        log(4, "Kernel setup completed")
+
+    def engine_prepare(self):
+        """Upload addresses and accumulators; page-lock host data of new blocks."""
+        super().engine_prepare()
+        for name, s in self.ob.S.items():
+            s.gpu, s.data = mppa(s.data)
+        for name, s in self.pr.S.items():
+            s.gpu, s.data = mppa(s.data)
+
+        for label, d in self.di.storages.items():
+            prep = self.diff_info[d.ID]
+            prep.addr_gpu = gpuarray.to_gpu(prep.addr)
+            if self.do_position_refinement:
+                prep.mangled_addr_gpu = prep.addr_gpu.copy()
+
+            # sums for averaging the global object/probe (nmr = numerator,
+            # dnm = denominator); the parent engine_prepare re-creates them as
+            # host arrays for every storage, so upload them for every storage
+            prep.ob_sum_nmr = gpuarray.to_gpu(prep.ob_sum_nmr)
+            prep.ob_sum_dnm = gpuarray.to_gpu(prep.ob_sum_dnm)
+            prep.pr_sum_nmr = gpuarray.to_gpu(prep.pr_sum_nmr)
+            prep.pr_sum_dnm = gpuarray.to_gpu(prep.pr_sum_dnm)
+
+        for label, d in self.ptycho.new_data:
+            dID = d.ID
+            prep = self.diff_info[dID]
+            pID, oID, eID = prep.poe_IDs
+
+            prep.ma_sum_gpu = gpuarray.to_gpu(prep.ma_sum)
+            prep.err_fourier_gpu = gpuarray.to_gpu(prep.err_fourier)
+            prep.err_phot_gpu = gpuarray.to_gpu(prep.err_phot)
+            prep.err_exit_gpu = gpuarray.to_gpu(prep.err_exit)
+            if self.do_position_refinement:
+                prep.error_state_gpu = gpuarray.empty_like(prep.err_fourier_gpu)
+
+            # prepare page-locked mems:
+            ma = self.ma.S[dID].data.astype(np.float32)
+            prep.ma = cuda.pagelocked_empty(ma.shape, ma.dtype, order="C", mem_flags=4)
+            prep.ma[:] = ma
+            ex = self.ex.S[eID].data
+            prep.ex = cuda.pagelocked_empty(ex.shape, ex.dtype, order="C", mem_flags=4)
+            prep.ex[:] = ex
+            mag = prep.mag
+            prep.mag = cuda.pagelocked_empty(mag.shape, mag.dtype, order="C", mem_flags=4)
+            prep.mag[:] = mag
+
+            self.ex_data.add_data_block()
+            self.ma_data.add_data_block()
+            self.mag_data.add_data_block()
+
+    def engine_iterate(self, num=1):
+        """
+        Run `num` WASP iterations on the GPU and return the per-view error dict.
+        """
+        self.dID_list = list(self.di.S.keys())
+        error = {}
+        for it in range(num):
+
+            for iblock, dID in enumerate(self.dID_list):
+
+                # find probe, object and exit ID in dependence of dID
+                prep = self.diff_info[dID]
+                pID, oID, eID = prep.poe_IDs
+
+                # references for kernels
+                kern = self.kernels[prep.label]
+                FUK = kern.FUK
+                AWK = kern.AWK
+                POK = kern.POK
+                MAK = kern.MAK
+                PROP = kern.PROP
+
+                # get aux buffer
+                aux = kern.aux
+
+                # local references
+                ob = self.ob.S[oID].gpu
+                pr = self.pr.S[pID].gpu
+
+                # the copy is important to prevent view_IDs_all being modified,
+                # which is always sorted
+                vieworder_all = prep.view_IDs_all.copy()
+                prep.rng.shuffle(vieworder_all)
+
+                # reset the accumulated sum of object/probe before going
+                # through all the diffraction views for this iteration
+                ob_sum_nmr = prep.ob_sum_nmr
+                ob_sum_dnm = prep.ob_sum_dnm
+                pr_sum_nmr = prep.pr_sum_nmr
+                pr_sum_dnm = prep.pr_sum_dnm
+                ob_sum_nmr.fill(0)
+                ob_sum_dnm.fill(0)
+                pr_sum_nmr.fill(0)
+                pr_sum_dnm.fill(0)
+
+                # Schedule ex, ma, mag to device
+                ev_ex, ex_full, data_ex = self.ex_data.to_gpu(prep.ex, dID, self.qu_htod)
+                ev_mag, mag_full, data_mag = self.mag_data.to_gpu(prep.mag, dID, self.qu_htod)
+                ev_ma, ma_full, data_ma = self.ma_data.to_gpu(prep.ma, dID, self.qu_htod)
+
+                # Reference to ex, ma and mag
+                prep.ex_full = ex_full
+                prep.mag_full = mag_full
+                prep.ma_full = ma_full
+
+                ## synchronize h2d stream with compute stream
+                self.queue.wait_for_event(ev_ex)
+
+                # Iterate through views
+                for vname in vieworder_all:
+                    # only proceed for active view, which is in prep.view_IDs
+                    # for this particular rank
+                    if vname not in prep.view_IDs:
+                        continue
+
+                    # Get local address and arrays
+                    i = prep.view_IDs.index(vname)
+                    addr = prep.addr_gpu[i,None]
+                    ex_from, ex_to = prep.addr_ex[i]
+                    ex = prep.ex_full[ex_from:ex_to]
+                    mag = prep.mag_full[i,None]
+                    ma = prep.ma_full[i,None]
+                    ma_sum = prep.ma_sum_gpu[i,None]
+                    err_phot = prep.err_phot_gpu[i,None]
+                    err_fourier = prep.err_fourier_gpu[i,None]
+                    err_exit = prep.err_exit_gpu[i,None]
+
+                    ## build auxiliary wave
+                    AWK.make_aux(aux, addr, ob, pr, ex, c_po=self._c, c_e=1-self._c)
+
+                    ## forward FFT
+                    PROP.fw(aux, aux)
+
+                    ## Deviation from measured data
+                    self.queue.wait_for_event(ev_mag)
+                    if self.p.compute_fourier_error:
+                        self.queue.wait_for_event(ev_ma)
+                        FUK.fourier_error(aux, addr, mag, ma, ma_sum)
+                        FUK.error_reduce(addr, err_fourier)
+                    else:
+                        FUK.fourier_deviation(aux, addr, mag)
+                        self.queue.wait_for_event(ev_ma)
+                    FUK.fmag_update_nopbound(aux, addr, mag, ma)
+
+                    ## backward FFT
+                    PROP.bw(aux, aux)
+
+                    ## build exit wave
+                    AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b, c_po=self._a, c_e=-(self._a + self._b))
+                    if self.p.compute_exit_error:
+                        FUK.exit_error(aux,addr)
+                        FUK.error_reduce(addr, err_exit)
+
+                    ## build auxiliary wave (ob * pr product)
+                    AWK.build_aux2_no_ex(aux, addr, ob, pr)
+
+                    # WASP ob and pr local update
+                    ob_old = ob.copy()  # the probe update below is fed the object from before its update
+                    POK.ob_update_wasp(addr, ob, pr, ex, aux, ob_sum_nmr,
+                                       ob_sum_dnm, alpha=self.p.alpha)
+                    POK.pr_update_wasp(addr, pr, ob_old, ex, aux, pr_sum_nmr,
+                                       pr_sum_dnm, beta=self.p.beta)
+
+
+                    ## compute log-likelihood
+                    if self.p.compute_log_likelihood:
+                        PROP.fw(aux, aux)
+                        FUK.log_likelihood2(aux, addr, mag, ma, err_phot)
+
+                # WASP averaging
+
+                # collect the sums across the multi-GPU communicator
+                self.multigpu.allReduceSum(ob_sum_nmr)
+                self.multigpu.allReduceSum(ob_sum_dnm)
+                self.multigpu.allReduceSum(pr_sum_nmr)
+                self.multigpu.allReduceSum(pr_sum_dnm)
+
+                POK.avg_wasp(ob, ob_sum_nmr, ob_sum_dnm)
+                POK.avg_wasp(pr, pr_sum_nmr, pr_sum_dnm)
+
+                # Clip object
+                if self.p.clip_object is not None:
+                    self.clip_object(ob)
+                    self.queue.synchronize()
+
+                data_ex.record_done(self.queue, 'compute')
+                if iblock + len(self.ex_data) < len(self.dID_list):
+                    data_ex.from_gpu(self.qu_dtoh)
+
+            # swap direction of the block sweep for the next iteration
+            self.dID_list.reverse()
+
+            # Re-center probe
+            self.center_probe()
+
+            # position update
+            self.position_update()
+
+            self.curiter += 1
+            self.ex_data.syncback = False
+
+        # finish all the compute
+        self.queue.synchronize()
+
+        for name, s in self.ob.S.items():
+            s.gpu.get_async(stream=self.qu_dtoh, ary=s.data)
+        for name, s in self.pr.S.items():
+            s.gpu.get_async(stream=self.qu_dtoh, ary=s.data)
+
+        for dID, prep in self.diff_info.items():
+            err_fourier = prep.err_fourier_gpu.get()
+            err_phot = prep.err_phot_gpu.get()
+            err_exit = prep.err_exit_gpu.get()
+            errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T)
+            error.update(zip(prep.view_IDs, errs))
+
+        # wait for the async transfers
+        self.qu_dtoh.synchronize()
+
+        self.error = error
+        return error
+
+    def clip_object(self, ob):
+        """
+        Clip the magnitudes of `ob` into the range given by ``p.clip_object``.
+        """
+        lower, upper = self.p.clip_object
+        self.CMK.clip_magnitudes_to_range(ob, lower, upper)
+
+    def position_update(self):
+        """
+        Position refinement (copied from _ProjectionEngine_pycuda).
+        NOTE(review): several inputs read below are not set up in this module - see inline notes.
+        """
+        if not self.do_position_refinement or (not self.curiter):
+            return
+        do_update_pos = (self.p.position_refinement.stop > self.curiter >= self.p.position_refinement.start)
+        do_update_pos &= (self.curiter % self.p.position_refinement.interval) == 0
+        use_tiles = (not self.p.probe_update_cuda_atomics) or (not self.p.object_update_cuda_atomics)
+
+        # Update positions
+        if do_update_pos:
+            """
+            Iterates through all positions and refines them by a given algorithm.
+            """
+            log(4, "----------- START POS REF -------------")
+            for dID in self.di.S.keys():
+
+                prep = self.diff_info[dID]
+                pID, oID, eID = prep.poe_IDs
+                ma = self.ma.S[dID].gpu  # NOTE(review): no .gpu is attached to mask storages in this module - confirm
+                ob = self.ob.S[oID].gpu
+                pr = self.pr.S[pID].gpu
+                kern = self.kernels[prep.label]
+                aux = kern.aux
+                addr = prep.addr_gpu
+                original_addr = prep.original_addr
+                mangled_addr = prep.mangled_addr_gpu
+                mag = prep.mag  # NOTE(review): host array (page-locked in engine_prepare), not a GPU array - confirm
+                ma_sum = prep.ma_sum  # NOTE(review): host array; the device copy is prep.ma_sum_gpu - confirm
+                err_fourier = prep.err_fourier_gpu
+                error_state = prep.error_state_gpu
+
+                PCK = kern.PCK
+                TK  = kern.TK  # NOTE(review): the TransposeKernel setup is commented out in _setup_kernels
+                PROP = kern.PROP
+
+                # Keep track of object boundaries
+                max_oby = ob.shape[-2] - aux.shape[-2] - 1
+                max_obx = ob.shape[-1] - aux.shape[-1] - 1
+
+                # We need to re-calculate the current error
+                PCK.build_aux(aux, addr, ob, pr)
+                PROP.fw(aux, aux)
+                if self.p.position_refinement.metric == "fourier":
+                    PCK.fourier_error(aux, addr, mag, ma, ma_sum)
+                    PCK.error_reduce(addr, err_fourier)
+                if self.p.position_refinement.metric == "photon":
+                    PCK.log_likelihood(aux, addr, mag, ma, err_fourier)
+                cuda.memcpy_dtod(dest=error_state.ptr,
+                                    src=err_fourier.ptr,
+                                    size=err_fourier.nbytes)
+
+                PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0])
+
+                log(4, 'Position refinement trial: iteration %s' % (self.curiter))
+                for i in range(PCK.mangler.nshifts):
+                    PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx)
+                    PCK.build_aux(aux, mangled_addr, ob, pr)
+                    PROP.fw(aux, aux)
+                    if self.p.position_refinement.metric == "fourier":
+                        PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum)
+                        PCK.error_reduce(mangled_addr, err_fourier)
+                    if self.p.position_refinement.metric == "photon":
+                        PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier)
+                    PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier)
+
+                cuda.memcpy_dtod(dest=err_fourier.ptr,
+                                    src=error_state.ptr,
+                                    size=err_fourier.nbytes)
+                if use_tiles:  # NOTE(review): prep.addr2_gpu is not assigned anywhere in this module - confirm
+                    s1 = addr.shape[0] * addr.shape[1]
+                    s2 = addr.shape[2] * addr.shape[3]
+                    TK.transpose(addr.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1))
+
+    def center_probe(self):
+        """Shift probe, object and exit waves on the GPU to re-centre the probe."""
+        tol = self.p.probe_center_tol
+        if tol is None:
+            return
+
+        for name, pr_s in self.pr.storages.items():
+            # mass centre of the abs2sum of the probe vs. the array centre
+            intensity = self.A2SK.abs2sum(pr_s.gpu)
+            com = self.MCK.mass_center(intensity).get()
+            target = (np.asarray(pr_s.shape[-2:]) // 2).astype(com.dtype)
+            shift = target - com
+
+            # exit if the current center of mass is within the tolerance
+            if u.norm(shift) < tol:
+                break
+
+            # shift the probe and the object storage of its first view
+            pr_s.gpu = self.ISK.interpolate_shift(pr_s.gpu, shift)
+            ob_s = pr_s.views[0].pod.ob_view.storage
+            ob_s.gpu = self.ISK.interpolate_shift(ob_s.gpu, shift)
+
+            # shift the exit waves that belong to this probe
+            for prep in (self.diff_info[dID] for dID in self.di.S.keys()):
+                if prep.poe_IDs[0] == name:
+                    prep.ex_full = self.ISK.interpolate_shift(prep.ex_full, shift)
+
+            log(4,'Probe recentered from %s to %s'
+                        % (str(tuple(com)), str(tuple(target))))
+
+    def engine_finalize(self):
+        """
+        Clear GPU data and replace the host buffers by plain copies.
+        """
+        self.ex_data = self.ma_data = self.mag_data = None
+
+        # drop the device-side object and probe arrays
+        for storage in self.ob.S.values():
+            del storage.gpu
+        for storage in self.pr.S.values():
+            del storage.gpu
+        # download the view addresses from the device
+        for prep in self.diff_info.values():
+            prep.addr = prep.addr_gpu.get()
+
+        # copy data to cpu
+        # this kills the pagelock memory (otherwise we get segfaults in h5py)
+        for storage in self.pr.S.values():
+            storage.data = np.copy(storage.data)
+        for storage in self.ob.S.values():
+            storage.data = np.copy(storage.data)
+
+        super().engine_finalize()
diff --git a/ptypy/custom/WASP_serial.py b/ptypy/custom/WASP_serial.py
new file mode 100644
index 000000000..96391ea27
--- /dev/null
+++ b/ptypy/custom/WASP_serial.py
@@ -0,0 +1,450 @@
+import numpy as np
+import time
+
+from ..engines import register
+from .WASP import WASP
+from ..utils.verbose import logger, log
+from ..utils import parallel
+from .. import utils as u
+from ..accelerate.base.kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
+from ..accelerate.base import array_utils as au
+from ..accelerate.base.engines import projectional_serial
+
+__all__ = ['WASP_serial']
+
+
+@register()
+class WASP_serial(WASP):
+    """
+    Weighted Average of Sequential Projections
+
+    Defaults:
+
+    [name]
+    default = WASP_serial
+    type = str
+    help =
+    doc =
+
+    [compute_exit_error]
+    default = False
+    type = bool
+    help = A switch for computing the exitwave error (this can impact the performance of the engine)
+
+    [compute_fourier_error]
+    default = False
+    type = bool
+    help = A switch for computing the fourier error (this can impact the performance of the engine)
+    """
+
+    def __init__(self, ptycho_parent, pars=None):
+        super().__init__(ptycho_parent, pars)
+
+        # kernel collections, keyed by scan label in _setup_kernels
+        self.kernels = {}
+        # Stores all information needed with respect to the diffraction storages.
+        self.diff_info = {}
+        self.benchmark = u.Param()
+
+    def engine_initialize(self):
+        """
+        Prepare for reconstruction: reset benchmarks, set up kernels. (Copied from _ProjectionEngine_serial)
+        """
+
+        super().engine_initialize()
+        self._reset_benchmarks()
+        self._setup_kernels()
+
+    def _reset_benchmarks(self):
+        """
+        Zero all timers and call counters (copied from _ProjectionEngine_serial, almost).
+        """
+        bench = self.benchmark
+        # accumulated time.time() differences per stage
+        for timer in ('A_Build_aux', 'B_Prop', 'C_Fourier_update',
+                      'D_iProp', 'E_Build_exit', 'F_LLerror',
+                      'wasp_ob_pr_update', 'wasp_averaging'):
+            setattr(bench, timer, 0.)
+        # number of calls per stage
+        for counter in ('calls_fourier', 'calls_wasp_ob_pr_update',
+                        'calls_wasp_averaging'):
+            setattr(bench, counter, 0)
+
+    def _setup_kernels(self):
+        """
+        Setup kernels and the aux buffer, one set for each scan of the ptycho model.
+        (Copied from _ProjectionEngine_serial)
+        """
+        # get the scans
+        for label, scan in self.ptycho.model.scans.items():
+
+            kern = u.Param()
+            kern.scanmodel = type(scan).__name__
+            self.kernels[label] = kern
+
+            # TODO: needs to be adapted for broad bandwidth
+            geo = scan.geometries[0]
+
+            # Get info to shape buffer arrays
+            fpc = scan.max_frames_per_block
+
+            # TODO : make this more foolproof
+            try:
+                nmodes = scan.p.coherence.num_probe_modes * \
+                         scan.p.coherence.num_object_modes
+            except Exception:
+                nmodes = 1
+
+            # create buffer arrays
+            ash = (fpc * nmodes,) + tuple(geo.shape)
+            aux = np.zeros(ash, dtype=np.complex64)
+            kern.aux = aux
+
+            # setup kernels, one for each SCAN.
+            kern.FUK = FourierUpdateKernel(aux, nmodes)
+            kern.FUK.allocate()
+
+            kern.POK = PoUpdateKernel()
+            kern.POK.allocate()
+
+            kern.AWK = AuxiliaryWaveKernel()
+            kern.AWK.allocate()
+
+            kern.FW = geo.propagator.fw
+            kern.BW = geo.propagator.bw
+            kern.resolution = geo.resolution[0]
+
+            if self.do_position_refinement:
+                kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution)
+                kern.PCK.allocate()
+
+    def engine_prepare(self):
+        """
+        Last minute initialization.
+
+        Everything that needs to be recalculated when new data arrives.
+        """
+        if self.ptycho.new_data:
+            # recalculate the power statistics over all diffraction storages
+            total_power = 0.
+            peak_power = 0.
+            for sto in self.di.storages.values():
+                total_power += sto.mean_power
+                # keeps the current peak unless the storage exceeds it
+                peak_power = max(peak_power, sto.max_power)
+            self.mean_power = total_power / len(self.di.storages)
+            self.max_power = peak_power
+
+        ## Serialize new data ##
+        for label, d in self.ptycho.new_data:
+            prep = u.Param()
+            prep.label = label
+            self.diff_info[d.ID] = prep
+            prep.mag = np.sqrt(np.abs(d.data))
+            prep.ma = self.ma.S[d.ID].data.astype(np.float32)
+            prep.ma_sum = prep.ma.sum(-1).sum(-1)
+            prep.err_phot = np.zeros_like(prep.ma_sum)
+            prep.err_fourier = np.zeros_like(prep.ma_sum)
+            prep.err_exit = np.zeros_like(prep.ma_sum)
+
+        # Unfortunately this needs to be done for all pods, since
+        # the shape of the probe / object was modified.
+        # TODO: possible scaling issue, remove the need for padding
+        for label, d in self.di.storages.items():
+            prep = self.diff_info[d.ID]
+            prep.view_IDs, prep.poe_IDs, prep.addr = projectional_serial.serialize_array_access(d)
+            if self.do_position_refinement:
+                prep.original_addr = np.zeros_like(prep.addr)
+                prep.original_addr[...] = prep.addr
+            pID, oID, eID = prep.poe_IDs
+
+            # Keep a list of view indices
+            prep.rng = np.random.default_rng(self.p.random_seed)
+            prep.vieworder = np.arange(prep.addr.shape[0])
+
+            # Modify addresses, copy pa into ea and remove da/ma
+            ex_first = prep.addr[:, 0, 2, 0]
+            ex_last = prep.addr[:, -1, 2, 0] + 1
+            prep.addr_ex = np.vstack([ex_first, ex_last]).T
+            prep.addr[:, :, 2] = prep.addr[:, :, 0]
+            prep.addr[:, :, 3:, 0] = 0
+
+            # Reference to ex
+            prep.ex = self.ex.S[eID].data
+
+            # sums for averaging the global object/probe, added up for each
+            # 'successive projection' (nmr = numerator, dnm = denominator)
+            prep.ob_sum_nmr = np.zeros_like(self.ob.S[oID].data, dtype=np.complex64)
+            prep.ob_sum_dnm = np.zeros_like(self.ob.S[oID].data, dtype=np.float32)
+            prep.pr_sum_nmr = np.zeros_like(self.pr.S[pID].data, dtype=np.complex64)
+            prep.pr_sum_dnm = np.zeros_like(self.pr.S[pID].data, dtype=np.float32)
+
+            # store the sorted IDs from all views for shuffling
+            scan_model = self.ptycho.model.scans[prep.label]
+            prep.view_IDs_all = sorted(v.ID for v in scan_model.diff_views)
+
+    def engine_iterate(self, num=1):
+        """
+        Compute one iteration.
+        """
+        for it in range(num):
+
+            error_dct = {}
+
+            for dID in self.di.S.keys():
+
+                # find probe, object and exit ID in dependence of dID
+                prep = self.diff_info[dID]
+                pID, oID, eID = prep.poe_IDs
+
+                # references for kernels
+                kern = self.kernels[prep.label]
+                FUK = kern.FUK
+                AWK = kern.AWK
+                POK = kern.POK
+                FW = kern.FW
+                BW = kern.BW
+
+                # global aux buffer
+                aux = kern.aux
+
+                # references for ob, pr
+                ob = self.ob.S[oID].data
+                pr = self.pr.S[pID].data
+
+                # the copy is important to prevent vieworder being modified,
+                # which is always sorted
+                vieworder_all = prep.view_IDs_all.copy()
+                prep.rng.shuffle(vieworder_all)
+
+                # reset the accumulated sum of object/probe before going
+                # through all the diffraction view for this iteration
+                ob_sum_nmr = prep.ob_sum_nmr
+                ob_sum_dnm = prep.ob_sum_dnm
+                pr_sum_nmr = prep.pr_sum_nmr
+                pr_sum_dnm = prep.pr_sum_dnm
+                ob_sum_nmr.fill(0)
+                ob_sum_dnm.fill(0)
+                pr_sum_nmr.fill(0)
+                pr_sum_dnm.fill(0)
+
+                # Iterate through views
+                for vname in vieworder_all:
+                    # only proceed for active view, which is in prep.view_IDs
+                    # for this particular rank
+                    if vname not in prep.view_IDs:
+                        continue
+
+                    # Get local adress and arrays
+                    i = prep.view_IDs.index(vname)
+                    addr = prep.addr[i,None]
+                    ex_from, ex_to = prep.addr_ex[i]
+                    ex = prep.ex[ex_from:ex_to]
+                    mag = prep.mag[i,None]
+                    ma = prep.ma[i,None]
+                    ma_sum = prep.ma_sum[i,None]
+                    err_phot = prep.err_phot[i,None]
+                    err_fourier = prep.err_fourier[i,None]
+                    err_exit = prep.err_exit[i,None]
+
+                    ## build auxilliary wave
+                    t1 = time.time()
+                    AWK.make_aux(aux, addr, ob, pr, ex, c_po=self._c, c_e=1-self._c)
+                    self.benchmark.A_Build_aux += time.time() - t1
+
+                    ## forward FFT
+                    t1 = time.time()
+                    aux[:] = FW(aux)
+                    self.benchmark.B_Prop += time.time() - t1
+
+                    ## Deviation from measured data
+                    t1 = time.time()
+                    if self.p.compute_fourier_error:
+                        FUK.fourier_error(aux, addr, mag, ma, ma_sum)
+                        FUK.error_reduce(addr, err_fourier)
+                    else:
+                        FUK.fourier_deviation(aux, addr, mag)
+                    FUK.fmag_update_nopbound(aux, addr, mag, ma)
+                    self.benchmark.C_Fourier_update += time.time() - t1
+
+                    ## backward FFT
+                    t1 = time.time()
+                    aux[:] = BW(aux)
+                    self.benchmark.D_iProp += time.time() - t1
+
+                    ## build exit wave
+                    t1 = time.time()
+                    AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b, c_po=self._a, c_e=-(self._a+self._b))
+                    if self.p.compute_exit_error:
+                        FUK.exit_error(aux,addr)
+                        FUK.error_reduce(addr, err_exit)
+                    self.benchmark.E_Build_exit += time.time() - t1
+                    self.benchmark.calls_fourier += 1
+
+                    ## build auxilliary wave (ob * pr product)
+                    t1 = time.time()
+                    AWK.build_aux_no_ex(aux, addr, ob, pr)
+                    self.benchmark.A_Build_aux += time.time() - t1
+
+                    # WASP ob and pr local update
+                    t1 = time.time()
+
+                    ob_old = ob.copy()
+                    POK.ob_update_wasp(addr, ob, pr, ex, aux, ob_sum_nmr,
+                                       ob_sum_dnm, alpha=self.p.alpha)
+                    POK.pr_update_wasp(addr, pr, ob_old, ex, aux, pr_sum_nmr,
+                                       pr_sum_dnm, beta=self.p.beta)
+
+                    self.benchmark.wasp_ob_pr_update += time.time() - t1
+                    self.benchmark.calls_wasp_ob_pr_update += 1
+
+                    ## compute log-likelihood
+                    if self.p.compute_log_likelihood:
+                        t1 = time.time()
+                        aux[:] = FW(aux)
+                        FUK.log_likelihood(aux, addr, mag, ma, err_phot)
+                        self.benchmark.F_LLerror += time.time() - t1
+
+                # update errors
+                errs = np.ascontiguousarray(np.vstack([np.hstack(prep.err_fourier),
+                                                       np.hstack(prep.err_phot),
+                                                       np.hstack(prep.err_exit)]).T)
+                error_dct.update(zip(prep.view_IDs, errs))
+
+                # WASP averaging
+                t1 = time.time()
+
+                # collect the sums
+                parallel.allreduce(ob_sum_nmr)
+                parallel.allreduce(ob_sum_dnm)
+                parallel.allreduce(pr_sum_nmr)
+                parallel.allreduce(pr_sum_dnm)
+
+                POK.avg_wasp(ob, ob_sum_nmr, ob_sum_dnm)
+                POK.avg_wasp(pr, pr_sum_nmr, pr_sum_dnm)
+
+                self.benchmark.wasp_averaging += time.time() - t1
+                self.benchmark.calls_wasp_averaging += 1
+
+                # Clip object (This call takes like one ms. Not time critical)
+                if self.p.clip_object is not None:
+                    clip_min, clip_max = self.p.clip_object
+                    ampl_obj = np.abs(ob)
+                    phase_obj = np.exp(1j * np.angle(ob))
+                    too_high = (ampl_obj > clip_max)
+                    too_low = (ampl_obj < clip_min)
+                    ob[too_high] = clip_max * phase_obj[too_high]
+                    ob[too_low] = clip_min * phase_obj[too_low]
+
+            # Re-center the probe
+            self.center_probe()
+
+            # position update
+            self.position_update()
+
+            self.curiter += 1
+
+        #error = parallel.gather_dict(error_dct)
+        return error_dct
+
+    def position_update(self):
+        """
+        Position refinement (copied from _ProjectionEngine_serial): scores trial addresses
+        with the configured metric. Does nothing outside the start/stop/interval window.
+        """
+        if not self.do_position_refinement:
+            return
+        do_update_pos = (self.p.position_refinement.stop > self.curiter >= self.p.position_refinement.start)
+        do_update_pos &= (self.curiter % self.p.position_refinement.interval) == 0
+
+        # Update positions (only on iterations selected by the window test above)
+        if do_update_pos:
+            """
+            Iterates through all positions and refines them by a given algorithm.
+            """
+            log(4, "----------- START POS REF -------------")
+            for dID in self.di.S.keys():
+
+                prep = self.diff_info[dID]
+                pID, oID, eID = prep.poe_IDs
+                ma = self.ma.S[dID].data
+                ob = self.ob.S[oID].data
+                pr = self.pr.S[pID].data
+                kern = self.kernels[prep.label]
+                aux = kern.aux
+                addr = prep.addr
+                original_addr = prep.original_addr  # NOTE(review): not referenced again in this method
+                mangled_addr = addr.copy()  # scratch array handed to mangler.get_address for each trial
+                mag = prep.mag
+                ma_sum = prep.ma_sum
+                err_fourier = prep.err_fourier
+
+                PCK = kern.PCK
+                FW = kern.FW
+
+                # Keep track of object boundaries (handed to mangler.get_address below)
+                max_oby = ob.shape[-2] - aux.shape[-2] - 1
+                max_obx = ob.shape[-1] - aux.shape[-1] - 1
+
+                # We need to re-calculate the current error; both metric branches pass err_fourier as the result array
+                PCK.build_aux(aux, addr, ob, pr)
+                aux[:] = FW(aux)
+                if self.p.position_refinement.metric == "fourier":
+                    PCK.fourier_error(aux, addr, mag, ma, ma_sum)
+                    PCK.error_reduce(addr, err_fourier)
+                if self.p.position_refinement.metric == "photon":
+                    PCK.log_likelihood(aux, addr, mag, ma, err_fourier)
+                error_state = np.zeros_like(err_fourier)
+                error_state[:] = err_fourier  # copy of the error computed for the current addresses
+                PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0])
+
+                log(4, 'Position refinement trial: iteration %s' % (self.curiter))
+                for i in range(PCK.mangler.nshifts):
+                    PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx)
+                    PCK.build_aux(aux, mangled_addr, ob, pr)
+                    aux[:] = FW(aux)
+                    if self.p.position_refinement.metric == "fourier":
+                        PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum)
+                        PCK.error_reduce(mangled_addr, err_fourier)
+                    if self.p.position_refinement.metric == "photon":
+                        PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier)
+                    PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier)  # presumably keeps the better address per frame - confirm in PCK
+
+                prep.err_fourier = error_state
+                prep.addr = addr
+
+    def engine_finalize(self):
+        """
+        Report and reset benchmarks, apply refined positions to the object views, then finalize the parent.
+        """
+        if parallel.master and self.benchmark.calls_fourier:  # report only on master and only if Fourier calls were counted
+            print("----- BENCHMARKS ----")
+            acc = 0.
+            for name in sorted(self.benchmark.keys()):
+                t = self.benchmark[name]
+                if name[0] in 'ABCDEFGHI':  # lettered timers are also summed into the 'Fourier_total' line
+                    print('%20s : %1.3f ms per iteration' % (name, t / self.benchmark.calls_fourier * 1000))
+                    acc += t
+                elif str(name) == 'wasp_ob_pr_update':
+                    print('%20s : %1.3f ms per call. %d calls' % (
+                        name, t / self.benchmark.calls_wasp_ob_pr_update * 1000, self.benchmark.calls_wasp_ob_pr_update))
+                elif str(name) == 'wasp_averaging':
+                    print('%20s : %1.3f ms per call. %d calls' % (
+                        name, t / self.benchmark.calls_wasp_averaging * 1000, self.benchmark.calls_wasp_averaging))
+
+            print('%20s : %1.3f ms per iteration. %d calls' % (
+                'Fourier_total', acc / self.benchmark.calls_fourier * 1000, self.benchmark.calls_fourier))
+
+        self._reset_benchmarks()
+
+        if self.do_position_refinement:
+            for label, d in self.di.storages.items():
+                prep = self.diff_info[d.ID]
+                res = self.kernels[prep.label].resolution
+                for i,view in enumerate(d.views):
+                    for j,(pname, pod) in enumerate(view.pods.items()):
+                        delta = (prep.original_addr[i][j][1][1:] - prep.addr[i][j][1][1:]) * res  # (original - refined) address offset scaled by the kernel resolution
+                        pod.ob_view.coord += delta
+                        pod.ob_view.storage.update_views(pod.ob_view)
+
+        super().engine_finalize()
diff --git a/templates/misc/moonflower_WASP.py b/templates/misc/moonflower_WASP.py
new file mode 100644
index 000000000..76c2ac73b
--- /dev/null
+++ b/templates/misc/moonflower_WASP.py
@@ -0,0 +1,58 @@
+"""
+Test script: ptychographic reconstruction with the WASP engine in the
+absence of actual data. The test Scan class
+`ptypy.core.data.MoonFlowerScan` provides the "data".
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+from ptypy.custom import WASP  # not referenced by name below; presumably imported so the 'WASP' engine is registered - confirm
+import numpy as np  # NOTE(review): not used in this script
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 200
+
+# set home path (a "ptypy" folder inside the system temporary directory)
+p.io = u.Param()
+p.io.home = "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photons in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# illumination settings and number of probe modes
+p.scans.MF.illumination=u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=2)
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'WASP'
+p.engines.engine00.numiter = 80
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/misc/moonflower_WASP_cupy.py b/templates/misc/moonflower_WASP_cupy.py
new file mode 100644
index 000000000..50dbae611
--- /dev/null
+++ b/templates/misc/moonflower_WASP_cupy.py
@@ -0,0 +1,59 @@
+"""
+Test script: ptychographic reconstruction with the WASP_cupy engine in
+the absence of actual data. The test Scan class
+`ptypy.core.data.MoonFlowerScan` provides the "data".
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+from ptypy.custom import WASP_cupy  # not referenced by name below; presumably imported so the 'WASP_cupy' engine is registered - confirm
+import numpy as np  # NOTE(review): not used in this script
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 200
+
+# set home path (a "ptypy" folder inside the system temporary directory)
+p.io = u.Param()
+p.io.home = "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photons in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# illumination settings and number of probe modes
+p.scans.MF.illumination=u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=2)
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'WASP_cupy'
+p.engines.engine00.numiter = 80
+p.engines.engine00.random_seed = 721
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/misc/moonflower_WASP_pycuda.py b/templates/misc/moonflower_WASP_pycuda.py
new file mode 100644
index 000000000..99a1f6205
--- /dev/null
+++ b/templates/misc/moonflower_WASP_pycuda.py
@@ -0,0 +1,59 @@
+"""
+Test script: ptychographic reconstruction with the WASP_pycuda engine in
+the absence of actual data. The test Scan class
+`ptypy.core.data.MoonFlowerScan` provides the "data".
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+from ptypy.custom import WASP_pycuda  # not referenced by name below; presumably imported so the 'WASP_pycuda' engine is registered - confirm
+import numpy as np  # NOTE(review): not used in this script
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 200
+
+# set home path (a "ptypy" folder inside the system temporary directory)
+p.io = u.Param()
+p.io.home = "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photons in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# illumination settings and number of probe modes
+p.scans.MF.illumination=u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=2)
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'WASP_pycuda'
+p.engines.engine00.numiter = 80
+p.engines.engine00.random_seed = 721
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/misc/moonflower_WASP_serial.py b/templates/misc/moonflower_WASP_serial.py
new file mode 100644
index 000000000..360fbb76e
--- /dev/null
+++ b/templates/misc/moonflower_WASP_serial.py
@@ -0,0 +1,58 @@
+"""
+Test script: ptychographic reconstruction with the WASP_serial engine in
+the absence of actual data. The test Scan class
+`ptypy.core.data.MoonFlowerScan` provides the "data".
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+from ptypy.custom import WASP_serial  # not referenced by name below; presumably imported so the 'WASP_serial' engine is registered - confirm
+import numpy as np  # NOTE(review): not used in this script
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 200
+
+# set home path (a "ptypy" folder inside the system temporary directory)
+p.io = u.Param()
+p.io.home = "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photons in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# illumination settings and number of probe modes
+p.scans.MF.illumination=u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=2)
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'WASP_serial'
+p.engines.engine00.numiter = 80
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/test/accelerate_tests/base_tests/WASP_tests.py b/test/accelerate_tests/base_tests/WASP_tests.py
new file mode 100644
index 000000000..a0ea4514e
--- /dev/null
+++ b/test/accelerate_tests/base_tests/WASP_tests.py
@@ -0,0 +1,155 @@
+"""
+Test for the WASP engine.
+
+This file is part of the PTYPY package.
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: see LICENSE for details.
+"""
+import tempfile
+import shutil
+import unittest
+
+import numpy as np
+
+from test import utils as tu
+from ptypy import utils as u
+from ptypy.custom import WASP, WASP_serial
+from ptypy.utils import parallel
+
+
+class WASPSerialTest(unittest.TestCase):  # compares reconstructions from the "WASP" and "WASP_serial" engines
+
+    def setUp(self):
+        self.outpath = tempfile.mkdtemp(suffix="WASP_serial_test")
+
+    def tearDown(self):
+        shutil.rmtree(self.outpath)
+
+    def check_engine_output(self, output, plotting=False, debug=False):
+        P_WASP, P_WASP_serial = output  # same order as the engine list used by the test methods
+        numiter = len(P_WASP.runtime["iter_info"])
+        LL_WASP = np.array([P_WASP.runtime["iter_info"][i]["error"][1] for i in range(numiter)])
+        LL_WASP_serial = np.array([P_WASP_serial.runtime["iter_info"][i]["error"][1] for i in range(numiter)])
+        crop = 42  # pixels trimmed from each edge of the object before comparison
+        OBJ_WASP_serial, OBJ_WASP = P_WASP_serial.obj.S["SMFG00"].data[0,crop:-crop,crop:-crop], P_WASP.obj.S["SMFG00"].data[0,crop:-crop,crop:-crop]
+        PRB_WASP_serial, PRB_WASP = P_WASP_serial.probe.S["SMFG00"].data[0], P_WASP.probe.S["SMFG00"].data[0]
+        eng_WASP = P_WASP.engines["engine00"]
+        eng_WASP_serial = P_WASP_serial.engines["engine00"]
+        if debug:
+            import matplotlib.pyplot as plt
+            plt.figure("WASP debug")
+            plt.imshow(np.abs(eng_WASP.debug))
+            plt.figure("WASP serial debug")
+            plt.imshow(np.abs(eng_WASP_serial.debug))
+            plt.show()
+
+        if plotting:
+            import matplotlib.pyplot as plt
+            plt.figure("Errors")
+            plt.plot(LL_WASP, label="WASP")
+            plt.plot(LL_WASP_serial, label="WASP_serial")
+            plt.legend()
+            plt.show()
+            plt.figure("Phase WASP")
+            plt.imshow(np.angle(OBJ_WASP))
+            plt.figure("Ampltitude WASP")
+            plt.imshow(np.abs(OBJ_WASP))
+            plt.figure("Phase WASP serial")
+            plt.imshow(np.angle(OBJ_WASP_serial))
+            plt.figure("Amplitude WASP serial")
+            plt.imshow(np.abs(OBJ_WASP_serial))
+            plt.figure("Phase difference")
+            plt.imshow(np.angle(OBJ_WASP_serial) - np.angle(OBJ_WASP), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.figure("Amplitude difference")
+            plt.imshow(np.abs(OBJ_WASP_serial) - np.abs(OBJ_WASP), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.show()
+
+            plt.figure("Phase WASP")
+            plt.imshow(np.angle(PRB_WASP))
+            plt.figure("Ampltitude WASP")
+            plt.imshow(np.abs(PRB_WASP))
+            plt.figure("Phase WASP serial")
+            plt.imshow(np.angle(PRB_WASP_serial))
+            plt.figure("Amplitude WASP serial")
+            plt.imshow(np.abs(PRB_WASP_serial))
+            plt.figure("Phase difference")
+            plt.imshow(np.angle(PRB_WASP_serial) - np.angle(PRB_WASP), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.figure("Amplitude difference")
+            plt.imshow(np.abs(PRB_WASP_serial) - np.abs(PRB_WASP), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.show()
+
+        RMSE_ob = (np.mean(np.abs(OBJ_WASP_serial - OBJ_WASP)**2))  # NOTE(review): mean of squared differences; no square root is taken
+        RMSE_pr = (np.mean(np.abs(PRB_WASP_serial - PRB_WASP)**2))  # NOTE(review): mean of squared differences; no square root is taken
+        np.testing.assert_allclose(RMSE_ob, 0.0, atol=1e-1,
+                                    err_msg="The object arrays are not matching as expected")
+
+        # the extremely high tolerance for probe is a result of precision
+        # difference between the normal and serial version, which is
+        # deliberate(?) to match GPU version
+        np.testing.assert_allclose(RMSE_pr, 0.0, atol=1e3,
+                                    err_msg="The probe arrays are not matching as expected")
+
+    def test_WASP_serial_base(self):
+        out = []
+        for eng in ["WASP", "WASP_serial"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_WASP_serial_clip(self):
+        out = []
+        for eng in ["WASP", "WASP_serial"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_WASP_serial_alpha_beta(self):
+        out = []
+        for eng in ["WASP", "WASP_serial"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.alpha = 0.64
+            engine_params.beta = 0.94
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_WASP_serial_all(self):
+        out = []
+        for eng in ["WASP", "WASP_serial"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
+            engine_params.alpha = 0.64
+            engine_params.beta = 0.94
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/accelerate_tests/cuda_cupy_tests/WASP_tests.py b/test/accelerate_tests/cuda_cupy_tests/WASP_tests.py
new file mode 100644
index 000000000..7ccda4930
--- /dev/null
+++ b/test/accelerate_tests/cuda_cupy_tests/WASP_tests.py
@@ -0,0 +1,154 @@
+"""
+Test for the WASP engine.
+
+This file is part of the PTYPY package.
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: see LICENSE for details.
+"""
+import tempfile
+import shutil
+import unittest
+
+import numpy as np
+
+from test import utils as tu
+from ptypy import utils as u
+from ptypy.custom import WASP_serial, WASP_cupy
+from ptypy.utils import parallel
+
+
+class WASPCupyTest(unittest.TestCase):  # compares reconstructions from the "WASP_serial" and "WASP_cupy" engines
+
+    def setUp(self):
+        self.outpath = tempfile.mkdtemp(suffix="WASP_cupy_test")
+
+    def tearDown(self):
+        shutil.rmtree(self.outpath)
+
+    def check_engine_output(self, output, plotting=False, debug=False):
+        P_WASP_serial, P_WASP_cupy = output  # same order as the engine list used by the test methods
+        numiter = len(P_WASP_serial.runtime["iter_info"])
+        LL_WASP_serial = np.array([P_WASP_serial.runtime["iter_info"][i]["error"][1] for i in range(numiter)])
+        LL_WASP_cupy = np.array([P_WASP_cupy.runtime["iter_info"][i]["error"][1] for i in range(numiter)])
+        crop = 42  # pixels trimmed from each edge of the object before comparison
+        OBJ_WASP_cupy, OBJ_WASP_serial = P_WASP_cupy.obj.S["SMFG00"].data[0,crop:-crop,crop:-crop], P_WASP_serial.obj.S["SMFG00"].data[0,crop:-crop,crop:-crop]
+        PRB_WASP_cupy, PRB_WASP_serial = P_WASP_cupy.probe.S["SMFG00"].data[0], P_WASP_serial.probe.S["SMFG00"].data[0]
+        eng_WASP_serial = P_WASP_serial.engines["engine00"]
+        eng_WASP_cupy = P_WASP_cupy.engines["engine00"]
+        if debug:
+            import matplotlib.pyplot as plt
+            plt.figure("WASP serial debug")
+            plt.imshow(np.abs(eng_WASP_serial.debug))
+            plt.figure("WASP cupy debug")
+            plt.imshow(np.abs(eng_WASP_cupy.debug))
+            plt.show()
+
+        if plotting:
+            import matplotlib.pyplot as plt
+            plt.figure("Errors")
+            plt.plot(LL_WASP_serial, label="WASP_serial")
+            plt.plot(LL_WASP_cupy, label="WASP_cupy")
+            plt.legend()
+            plt.show()
+            plt.figure("Phase WASP serial")
+            plt.imshow(np.angle(OBJ_WASP_serial))
+            plt.figure("Ampltitude WASP serial")
+            plt.imshow(np.abs(OBJ_WASP_serial))
+            plt.figure("Phase WASP cupy")
+            plt.imshow(np.angle(OBJ_WASP_cupy))
+            plt.figure("Amplitude WASP cupy")
+            plt.imshow(np.abs(OBJ_WASP_cupy))
+            plt.figure("Phase difference")
+            plt.imshow(np.angle(OBJ_WASP_cupy) - np.angle(OBJ_WASP_serial), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.figure("Amplitude difference")
+            plt.imshow(np.abs(OBJ_WASP_cupy) - np.abs(OBJ_WASP_serial), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.show()
+
+            plt.figure("Phase WASP serial")
+            plt.imshow(np.angle(PRB_WASP_serial))
+            plt.figure("Ampltitude WASP serial")
+            plt.imshow(np.abs(PRB_WASP_serial))
+            plt.figure("Phase WASP cupy")
+            plt.imshow(np.angle(PRB_WASP_cupy))
+            plt.figure("Amplitude WASP cupy")
+            plt.imshow(np.abs(PRB_WASP_cupy))
+            plt.figure("Phase difference")
+            plt.imshow(np.angle(PRB_WASP_cupy) - np.angle(PRB_WASP_serial), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.figure("Amplitude difference")
+            plt.imshow(np.abs(PRB_WASP_cupy) - np.abs(PRB_WASP_serial), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.show()
+
+        RMSE_ob = (np.mean(np.abs(OBJ_WASP_cupy - OBJ_WASP_serial)**2))  # NOTE(review): mean of squared differences; no square root is taken
+        RMSE_pr = (np.mean(np.abs(PRB_WASP_cupy - PRB_WASP_serial)**2))  # NOTE(review): mean of squared differences; no square root is taken
+        np.testing.assert_allclose(RMSE_ob, 0.0, atol=1e-1,
+                                    err_msg="The object arrays are not matching as expected")
+
+        # the extremely high tolerance for probe is a result of precision
+        # difference between the serial and cupy version
+        np.testing.assert_allclose(RMSE_pr, 0.0, atol=1e3,
+                                    err_msg="The probe arrays are not matching as expected")
+
+    def test_WASP_cupy_base(self):
+        out = []
+        for eng in ["WASP_serial", "WASP_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_WASP_cupy_clip(self):
+        out = []
+        for eng in ["WASP_serial", "WASP_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_WASP_cupy_alpha_beta(self):
+        out = []
+        for eng in ["WASP_serial", "WASP_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.alpha = 0.64
+            engine_params.beta = 0.94
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_WASP_cupy_all(self):
+        out = []
+        for eng in ["WASP_serial", "WASP_cupy"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
+            engine_params.alpha = 0.64
+            engine_params.beta = 0.94
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py
index 8a41bad35..5b2751877 100644
--- a/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py
+++ b/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py
@@ -76,17 +76,17 @@ def prepare_arrays(self, scan_points=None):
 
         object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE)
         for idx in range(G):
-            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) 
+            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2)
 
         probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE)
         for idx in range(D):
-            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) 
+            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2)
 
-        return (cp.asarray(addr), 
-            cp.asarray(object_array), 
-            cp.asarray(object_array_denominator), 
-            cp.asarray(probe), 
-            cp.asarray(exit_wave), 
+        return (cp.asarray(addr),
+            cp.asarray(object_array),
+            cp.asarray(object_array_denominator),
+            cp.asarray(probe),
+            cp.asarray(exit_wave),
             cp.asarray(probe_denominator))
 
 
@@ -96,7 +96,7 @@ def test_init(self):
                                 err_msg='PoUpdateKernel does not have the correct functions registered.')
 
     def ob_update_REGRESSION_tester(self, atomics=True):
-        
+
         B = 5  # frame size y
         C = 5  # frame size x
 
@@ -291,7 +291,7 @@ def ob_update_UNITY_tester(self, atomics=True):
         '''
         object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE)
         for idx in range(G):
-            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) 
+            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2)
 
 
         POUK = PoUpdateKernel()
@@ -330,7 +330,7 @@ def ob_update_UNITY_tester(self, atomics=True):
 
     def test_ob_update_atomics_UNITY(self):
         self.ob_update_UNITY_tester(atomics=True)
-    
+
     def test_ob_update_tiled_UNITY(self):
         self.ob_update_UNITY_tester(atomics=False)
 
@@ -394,7 +394,7 @@ def pr_update_REGRESSION_tester(self, atomics=True):
         '''
         probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE)
         for idx in range(D):
-            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) 
+            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2)
 
         POUK = PoUpdateKernel()
 
@@ -929,7 +929,7 @@ def test_pr_norm_local_UNITY(self):
         POUK = PoUpdateKernel()
 
         probe_dev = cp.asarray(probe)
-        probe_norm_dev = cp.asarray(probe_norm) 
+        probe_norm_dev = cp.asarray(probe_norm)
         addr_dev = cp.asarray(addr)
 
         POUK.pr_norm_local(addr_dev,  probe_dev, probe_norm_dev)
@@ -938,6 +938,226 @@ def test_pr_norm_local_UNITY(self):
         np.testing.assert_allclose(probe_norm_dev.get(), probe_norm, rtol=1e-6, atol=1e-6,
                                       err_msg="The probe norm has not been updated as expected")
 
+    def test_ob_update_wasp_UNITY(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  #  object size y
+        I = C + npts_greater_than  #  object size x
+
+        scan_pts = 1  # one dimensional scan point number
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes # number of exit waves: one per scan position and mode (1 position x 4 modes here)
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+        auxiliary_wave = exit_wave.copy() * 1.5
+
+        ob_sum_nmr = np.zeros(shape=(G,H,I), dtype=COMPLEX_TYPE)
+        ob_sum_dnm = np.zeros(shape=(G,H,I), dtype=FLOAT_TYPE)
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):#
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+
+        '''
+        test
+        '''
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+        POUK = PoUpdateKernel()
+
+        object_array_dev = cp.asarray(object_array)
+        probe_dev = cp.asarray(probe)
+        exit_wave_dev = cp.asarray(exit_wave)
+        auxiliary_wave_dev = cp.asarray(auxiliary_wave)
+        ob_sum_nmr_dev = cp.asarray(ob_sum_nmr)
+        ob_sum_dnm_dev = cp.asarray(ob_sum_dnm)
+        addr_dev = cp.asarray(addr)
+
+        POUK.ob_update_wasp(addr_dev, object_array_dev, probe_dev,
+                            exit_wave_dev, auxiliary_wave_dev, ob_sum_nmr_dev,
+                            ob_sum_dnm_dev)
+
+        nPOUK.ob_update_wasp(addr, object_array, probe, exit_wave,
+                             auxiliary_wave, ob_sum_nmr, ob_sum_dnm)
+
+        np.testing.assert_allclose(object_array_dev.get(), object_array, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object has not been updated as expected")
+        np.testing.assert_allclose(ob_sum_nmr_dev.get(), ob_sum_nmr, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object sum numerator has not been updated as expected")
+        np.testing.assert_allclose(ob_sum_dnm_dev.get(), ob_sum_dnm, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object sum denominator has not been updated as expected")
+
+    def test_pr_update_wasp_UNITY(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  #  object size y
+        I = C + npts_greater_than  #  object size x
+
+        scan_pts = 1  # one dimensional scan point number
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes # number of exit waves: one per scan position and mode (1 position x 4 modes here)
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+        auxiliary_wave = exit_wave.copy() * 1.5
+
+        pr_sum_nmr = np.zeros(shape=(D,E,F), dtype=COMPLEX_TYPE)
+        pr_sum_dnm = np.zeros(shape=(D,E,F), dtype=FLOAT_TYPE)
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):#
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+
+        '''
+        test
+        '''
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+        POUK = PoUpdateKernel()
+
+        object_array_dev = cp.asarray(object_array)
+        probe_dev = cp.asarray(probe)
+        exit_wave_dev = cp.asarray(exit_wave)
+        auxiliary_wave_dev = cp.asarray(auxiliary_wave)
+        pr_sum_nmr_dev = cp.asarray(pr_sum_nmr)
+        pr_sum_dnm_dev = cp.asarray(pr_sum_dnm)
+        addr_dev = cp.asarray(addr)
+
+        POUK.pr_update_wasp(addr_dev, probe_dev, object_array_dev,
+                            exit_wave_dev, auxiliary_wave_dev, pr_sum_nmr_dev,
+                            pr_sum_dnm_dev)
+
+        nPOUK.pr_update_wasp(addr, probe, object_array, exit_wave,
+                             auxiliary_wave, pr_sum_nmr, pr_sum_dnm)
+
+        np.testing.assert_allclose(probe_dev.get(), probe, rtol=1e-6, atol=1e-6,
+                                   err_msg="The probe has not been updated as expected")
+        np.testing.assert_allclose(pr_sum_nmr_dev.get(), pr_sum_nmr, rtol=1e-6, atol=1e-6,
+                                   err_msg="The probe sum numerator has not been updated as expected")
+        np.testing.assert_allclose(pr_sum_dnm_dev.get(), pr_sum_dnm, rtol=1e-6, atol=1e-6,
+                                   err_msg="The probe sum denominator has not been updated as expected")
+
+    def test_avg_wasp_UNITY(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  #  object size y
+        I = C + npts_greater_than  #  object size x
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        ob_sum_nmr = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            ob_sum_nmr[idx] = np.arange(idx, idx+H*I).reshape(H,I) + 1j * np.arange(idx, idx+H*I).reshape(H,I)
+
+        ob_sum_dnm = ob_sum_nmr[:, ::-1, ::-1].copy().real
+
+        '''
+        test
+        '''
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+        POUK = PoUpdateKernel()
+
+        object_array_dev = cp.asarray(object_array)
+        ob_sum_nmr_dev = cp.asarray(ob_sum_nmr)
+        ob_sum_dnm_dev = cp.asarray(ob_sum_dnm)
+
+        POUK.avg_wasp(object_array_dev, ob_sum_nmr_dev, ob_sum_dnm_dev)
+        nPOUK.avg_wasp(object_array, ob_sum_nmr, ob_sum_dnm)
+
+        np.testing.assert_allclose(object_array_dev.get(), object_array, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object_array has not been updated as expected")
+        np.testing.assert_allclose(ob_sum_nmr_dev.get(), ob_sum_nmr, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object_array sum numerator should be unchanged")
+        np.testing.assert_allclose(ob_sum_dnm_dev.get(), ob_sum_dnm, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object_array sum denominator should be unchanged")
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/accelerate_tests/cuda_pycuda_tests/WASP_tests.py b/test/accelerate_tests/cuda_pycuda_tests/WASP_tests.py
new file mode 100644
index 000000000..f8bf5f0c8
--- /dev/null
+++ b/test/accelerate_tests/cuda_pycuda_tests/WASP_tests.py
@@ -0,0 +1,154 @@
+"""
+Test for the WASP engine.
+
+This file is part of the PTYPY package.
+    :copyright: Copyright 2014 by the PTYPY team, see AUTHORS.
+    :license: see LICENSE for details.
+"""
+import tempfile
+import shutil
+import unittest
+
+import numpy as np
+
+from test import utils as tu
+from ptypy import utils as u
+from ptypy.custom import WASP_serial, WASP_pycuda
+from ptypy.utils import parallel
+
+
+class WASPPycudaTest(unittest.TestCase):
+
+    def setUp(self):
+        self.outpath = tempfile.mkdtemp(suffix="WASP_pycuda_test")
+
+    def tearDown(self):
+        shutil.rmtree(self.outpath)
+
+    def check_engine_output(self, output, plotting=False, debug=False):
+        P_WASP_serial, P_WASP_pycuda = output
+        numiter = len(P_WASP_serial.runtime["iter_info"])
+        LL_WASP_serial = np.array([P_WASP_serial.runtime["iter_info"][i]["error"][1] for i in range(numiter)])
+        LL_WASP_pycuda = np.array([P_WASP_pycuda.runtime["iter_info"][i]["error"][1] for i in range(numiter)])
+        crop = 42
+        OBJ_WASP_pycuda, OBJ_WASP_serial = P_WASP_pycuda.obj.S["SMFG00"].data[0,crop:-crop,crop:-crop], P_WASP_serial.obj.S["SMFG00"].data[0,crop:-crop,crop:-crop]
+        PRB_WASP_pycuda, PRB_WASP_serial = P_WASP_pycuda.probe.S["SMFG00"].data[0], P_WASP_serial.probe.S["SMFG00"].data[0]
+        eng_WASP_serial = P_WASP_serial.engines["engine00"]
+        eng_WASP_pycuda = P_WASP_pycuda.engines["engine00"]
+        if debug:
+            import matplotlib.pyplot as plt
+            plt.figure("WASP serial debug")
+            plt.imshow(np.abs(eng_WASP_serial.debug))
+            plt.figure("WASP pycuda debug")
+            plt.imshow(np.abs(eng_WASP_pycuda.debug))
+            plt.show()
+
+        if plotting:
+            import matplotlib.pyplot as plt
+            plt.figure("Errors")
+            plt.plot(LL_WASP_serial, label="WASP_serial")
+            plt.plot(LL_WASP_pycuda, label="WASP_pycuda")
+            plt.legend()
+            plt.show()
+            plt.figure("Phase WASP serial")
+            plt.imshow(np.angle(OBJ_WASP_serial))
+            plt.figure("Ampltitude WASP serial")
+            plt.imshow(np.abs(OBJ_WASP_serial))
+            plt.figure("Phase WASP pycuda")
+            plt.imshow(np.angle(OBJ_WASP_pycuda))
+            plt.figure("Amplitude WASP pycuda")
+            plt.imshow(np.abs(OBJ_WASP_pycuda))
+            plt.figure("Phase difference")
+            plt.imshow(np.angle(OBJ_WASP_pycuda) - np.angle(OBJ_WASP_serial), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.figure("Amplitude difference")
+            plt.imshow(np.abs(OBJ_WASP_pycuda) - np.abs(OBJ_WASP_serial), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.show()
+
+            plt.figure("Phase WASP serial")
+            plt.imshow(np.angle(PRB_WASP_serial))
+            plt.figure("Ampltitude WASP serial")
+            plt.imshow(np.abs(PRB_WASP_serial))
+            plt.figure("Phase WASP pycuda")
+            plt.imshow(np.angle(PRB_WASP_pycuda))
+            plt.figure("Amplitude WASP pycuda")
+            plt.imshow(np.abs(PRB_WASP_pycuda))
+            plt.figure("Phase difference")
+            plt.imshow(np.angle(PRB_WASP_pycuda) - np.angle(PRB_WASP_serial), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.figure("Amplitude difference")
+            plt.imshow(np.abs(PRB_WASP_pycuda) - np.abs(PRB_WASP_serial), vmin=-0.1, vmax=0.1)
+            plt.colorbar()
+            plt.show()
+
+        RMSE_ob = (np.mean(np.abs(OBJ_WASP_pycuda - OBJ_WASP_serial)**2))
+        RMSE_pr = (np.mean(np.abs(PRB_WASP_pycuda - PRB_WASP_serial)**2))
+        np.testing.assert_allclose(RMSE_ob, 0.0, atol=1e-1,
+                                    err_msg="The object arrays are not matching as expected")
+
+        # the extremely high tolerance for the probe is a result of the
+        # precision difference between the serial and pycuda versions
+        np.testing.assert_allclose(RMSE_pr, 0.0, atol=1e3,
+                                    err_msg="The probe arrays are not matching as expected")
+
+    def test_WASP_pycuda_base(self):
+        out = []
+        for eng in ["WASP_serial", "WASP_pycuda"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_WASP_pycuda_clip(self):
+        out = []
+        for eng in ["WASP_serial", "WASP_pycuda"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_WASP_pycuda_alpha_beta(self):
+        out = []
+        for eng in ["WASP_serial", "WASP_pycuda"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.alpha = 0.64
+            engine_params.beta = 0.94
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
+    def test_WASP_pycuda_all(self):
+        out = []
+        for eng in ["WASP_serial", "WASP_pycuda"]:
+            engine_params = u.Param()
+            engine_params.name = eng
+            engine_params.numiter = 10
+            engine_params.clip_object = (0, 2)
+            engine_params.alpha = 0.64
+            engine_params.beta = 0.94
+            engine_params.random_seed = 721
+            out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True,
+                                           scanmodel="BlockFull", autosave=False, verbose_level="critical"))
+
+        if parallel.master:
+            self.check_engine_output(out, plotting=False, debug=False)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py
index 27c6abb56..5fc1112cd 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/po_update_kernel_test.py
@@ -76,17 +76,17 @@ def prepare_arrays(self, scan_points=None):
 
         object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE)
         for idx in range(G):
-            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) 
+            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2)
 
         probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE)
         for idx in range(D):
-            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) 
+            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2)
 
-        return (gpuarray.to_gpu(addr), 
-            gpuarray.to_gpu(object_array), 
-            gpuarray.to_gpu(object_array_denominator), 
-            gpuarray.to_gpu(probe), 
-            gpuarray.to_gpu(exit_wave), 
+        return (gpuarray.to_gpu(addr),
+            gpuarray.to_gpu(object_array),
+            gpuarray.to_gpu(object_array_denominator),
+            gpuarray.to_gpu(probe),
+            gpuarray.to_gpu(exit_wave),
             gpuarray.to_gpu(probe_denominator))
 
 
@@ -96,7 +96,7 @@ def test_init(self):
                                 err_msg='PoUpdateKernel does not have the correct functions registered.')
 
     def ob_update_REGRESSION_tester(self, atomics=True):
-        
+
         B = 5  # frame size y
         C = 5  # frame size x
 
@@ -291,7 +291,7 @@ def ob_update_UNITY_tester(self, atomics=True):
         '''
         object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE)
         for idx in range(G):
-            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) 
+            object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2)
 
 
         POUK = PoUpdateKernel()
@@ -330,7 +330,7 @@ def ob_update_UNITY_tester(self, atomics=True):
 
     def test_ob_update_atomics_UNITY(self):
         self.ob_update_UNITY_tester(atomics=True)
-    
+
     def test_ob_update_tiled_UNITY(self):
         self.ob_update_UNITY_tester(atomics=False)
 
@@ -394,7 +394,7 @@ def pr_update_REGRESSION_tester(self, atomics=True):
         '''
         probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE)
         for idx in range(D):
-            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) 
+            probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2)
 
         POUK = PoUpdateKernel()
 
@@ -929,7 +929,7 @@ def test_pr_norm_local_UNITY(self):
         POUK = PoUpdateKernel()
 
         probe_dev = gpuarray.to_gpu(probe)
-        probe_norm_dev = gpuarray.to_gpu(probe_norm) 
+        probe_norm_dev = gpuarray.to_gpu(probe_norm)
         addr_dev = gpuarray.to_gpu(addr)
 
         POUK.pr_norm_local(addr_dev,  probe_dev, probe_norm_dev)
@@ -938,6 +938,226 @@ def test_pr_norm_local_UNITY(self):
         np.testing.assert_allclose(probe_norm_dev.get(), probe_norm, rtol=1e-6, atol=1e-6,
                                       err_msg="The probe norm has not been updated as expected")
 
+    def test_ob_update_wasp_UNITY(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  #  object size y
+        I = C + npts_greater_than  #  object size x
+
+        scan_pts = 1  # one dimensional scan point number
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes # number of exit waves: one per scan position and mode (1 position x 4 modes here)
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+        auxiliary_wave = exit_wave.copy() * 1.5
+
+        ob_sum_nmr = np.zeros(shape=(G,H,I), dtype=COMPLEX_TYPE)
+        ob_sum_dnm = np.zeros(shape=(G,H,I), dtype=FLOAT_TYPE)
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):#
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+
+        '''
+        test
+        '''
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+        POUK = PoUpdateKernel()
+
+        object_array_dev = gpuarray.to_gpu(object_array)
+        probe_dev = gpuarray.to_gpu(probe)
+        exit_wave_dev = gpuarray.to_gpu(exit_wave)
+        auxiliary_wave_dev = gpuarray.to_gpu(auxiliary_wave)
+        ob_sum_nmr_dev = gpuarray.to_gpu(ob_sum_nmr)
+        ob_sum_dnm_dev = gpuarray.to_gpu(ob_sum_dnm)
+        addr_dev = gpuarray.to_gpu(addr)
+
+        POUK.ob_update_wasp(addr_dev, object_array_dev, probe_dev,
+                            exit_wave_dev, auxiliary_wave_dev, ob_sum_nmr_dev,
+                            ob_sum_dnm_dev)
+
+        nPOUK.ob_update_wasp(addr, object_array, probe, exit_wave,
+                             auxiliary_wave, ob_sum_nmr, ob_sum_dnm)
+
+        np.testing.assert_allclose(object_array_dev.get(), object_array, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object has not been updated as expected")
+        np.testing.assert_allclose(ob_sum_nmr_dev.get(), ob_sum_nmr, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object sum numerator has not been updated as expected")
+        np.testing.assert_allclose(ob_sum_dnm_dev.get(), ob_sum_dnm, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object sum denominator has not been updated as expected")
+
+    def test_pr_update_wasp_UNITY(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        D = 2  # number of probe modes
+        E = B  # probe size y
+        F = C  # probe size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  #  object size y
+        I = C + npts_greater_than  #  object size x
+
+        scan_pts = 1  # one dimensional scan point number
+
+        total_number_scan_positions = scan_pts ** 2
+        total_number_modes = G * D
+        A = total_number_scan_positions * total_number_modes # number of exit waves: one per scan position and mode (1 position x 4 modes here)
+
+        probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE)
+        for idx in range(D):
+            probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1)
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE)
+        for idx in range(A):
+            exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1)
+        auxiliary_wave = exit_wave.copy() * 1.5
+
+        pr_sum_nmr = np.zeros(shape=(D,E,F), dtype=COMPLEX_TYPE)
+        pr_sum_dnm = np.zeros(shape=(D,E,F), dtype=FLOAT_TYPE)
+
+        X, Y = np.meshgrid(range(scan_pts), range(scan_pts))
+        X = X.reshape((total_number_scan_positions))
+        Y = Y.reshape((total_number_scan_positions))
+
+        addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE)
+
+        exit_idx = 0
+        position_idx = 0
+        for xpos, ypos in zip(X, Y):#
+            mode_idx = 0
+            for pr_mode in range(D):
+                for ob_mode in range(G):
+                    addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0],
+                                                             [ob_mode, ypos, xpos],
+                                                             [exit_idx, 0, 0],
+                                                             [0, 0, 0],
+                                                             [0, 0, 0]], dtype=INT_TYPE)
+                    mode_idx += 1
+                    exit_idx += 1
+            position_idx += 1
+
+
+        '''
+        test
+        '''
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+        POUK = PoUpdateKernel()
+
+        object_array_dev = gpuarray.to_gpu(object_array)
+        probe_dev = gpuarray.to_gpu(probe)
+        exit_wave_dev = gpuarray.to_gpu(exit_wave)
+        auxiliary_wave_dev = gpuarray.to_gpu(auxiliary_wave)
+        pr_sum_nmr_dev = gpuarray.to_gpu(pr_sum_nmr)
+        pr_sum_dnm_dev = gpuarray.to_gpu(pr_sum_dnm)
+        addr_dev = gpuarray.to_gpu(addr)
+
+        POUK.pr_update_wasp(addr_dev, probe_dev, object_array_dev,
+                            exit_wave_dev, auxiliary_wave_dev, pr_sum_nmr_dev,
+                            pr_sum_dnm_dev)
+
+        nPOUK.pr_update_wasp(addr, probe, object_array, exit_wave,
+                             auxiliary_wave, pr_sum_nmr, pr_sum_dnm)
+
+        np.testing.assert_allclose(probe_dev.get(), probe, rtol=1e-6, atol=1e-6,
+                                   err_msg="The probe has not been updated as expected")
+        np.testing.assert_allclose(pr_sum_nmr_dev.get(), pr_sum_nmr, rtol=1e-6, atol=1e-6,
+                                   err_msg="The probe sum numerator has not been updated as expected")
+        np.testing.assert_allclose(pr_sum_dnm_dev.get(), pr_sum_dnm, rtol=1e-6, atol=1e-6,
+                                   err_msg="The probe sum denominator has not been updated as expected")
+
+    def test_avg_wasp_UNITY(self):
+        '''
+        setup
+        '''
+        B = 5  # frame size y
+        C = 5  # frame size x
+
+        npts_greater_than = 2  # how many points bigger than the probe the object is.
+        G = 2  # number of object modes
+        H = B + npts_greater_than  #  object size y
+        I = C + npts_greater_than  #  object size x
+
+        object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1)
+
+        ob_sum_nmr = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE)
+        for idx in range(G):
+            ob_sum_nmr[idx] = np.arange(idx, idx+H*I).reshape(H,I) + 1j * np.arange(idx, idx+H*I).reshape(H,I)
+
+        ob_sum_dnm = ob_sum_nmr[:, ::-1, ::-1].copy().real
+
+        '''
+        test
+        '''
+        from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel
+        nPOUK = npPoUpdateKernel()
+        POUK = PoUpdateKernel()
+
+        object_array_dev = gpuarray.to_gpu(object_array)
+        ob_sum_nmr_dev = gpuarray.to_gpu(ob_sum_nmr)
+        ob_sum_dnm_dev = gpuarray.to_gpu(ob_sum_dnm)
+
+        POUK.avg_wasp(object_array_dev, ob_sum_nmr_dev, ob_sum_dnm_dev)
+        nPOUK.avg_wasp(object_array, ob_sum_nmr, ob_sum_dnm)
+
+        np.testing.assert_allclose(object_array_dev.get(), object_array, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object_array has not been updated as expected")
+        np.testing.assert_allclose(ob_sum_nmr_dev.get(), ob_sum_nmr, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object_array sum numerator should be unchanged")
+        np.testing.assert_allclose(ob_sum_dnm_dev.get(), ob_sum_dnm, rtol=1e-6, atol=1e-6,
+                                   err_msg="The object_array sum denominator should be unchanged")
 
 if __name__ == '__main__':
     unittest.main()

From db8b26c2582285518c791cd2e8edc9cf94c029b0 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Fri, 2 Feb 2024 16:15:37 +0000
Subject: [PATCH 34/37] Add new option to provide arbitrary order of frames
 (#526)

* Add new option to provide arbitrary order of frames

* Add logging

* remove debugging traces
---
 ptypy/experiment/hdf5_loader.py        |  29 ++-
 test/ptyscan_tests/hdf5_loader_test.py | 254 +++++++++++++++++++++++++
 2 files changed, 282 insertions(+), 1 deletion(-)

diff --git a/ptypy/experiment/hdf5_loader.py b/ptypy/experiment/hdf5_loader.py
index 44344477f..ac54caa81 100644
--- a/ptypy/experiment/hdf5_loader.py
+++ b/ptypy/experiment/hdf5_loader.py
@@ -193,7 +193,7 @@ class Hdf5Loader(PtyScan):
     default =
     type = Param
     help = Parameters for the filtering of frames
-    doc = The shape of loaded data is assumed to hvae the same dimensionality as data.shape[:-2]
+    doc = The shape of loaded data is assumed to have the same dimensionality as data.shape[:-2]
 
     [framefilter.file]
     default = None
@@ -298,6 +298,18 @@ class Hdf5Loader(PtyScan):
     help = Switch for loading data from electron ptychography experiments.
     doc = If True, the energy provided in keV will be considered as electron energy
           and converted to electron wavelengths.
+
+    [frameorder]
+    default =
+    type = Param
+    help = Parameters for the re-ordering of frames
+    doc = The shape of loaded array of indices is matching the dimensionality of the loaded intensity
+
+    [frameorder.indices]
+    default = None
+    type = list, ndarray
+    help = This is the array or list with the re-ordered indices.
+
     """
 
     def __init__(self, pars=None, swmr=False, **kwargs):
@@ -596,6 +608,16 @@ def _prepare_center(self):
             log(3, "center is %s, auto_center: %s" % (self.info.center, self.info.auto_center))
             log(3, "The loader will not do any cropping.")
 
+    def _reorder_preview_indices(self):  # apply the optional user-supplied frame order (frameorder.indices), if any
+        if self.p.frameorder.indices is None:
+            return
+        order = np.array(self.p.frameorder.indices, dtype=int)
+        if (order.max() >= self.preview_indices.shape[-1]):  # valid frame indices are 0 .. N-1, so N itself is already out of range
+            log(3, "Given frameorder does not match dimensionality of data, keeping the original order")
+            return
+        log(3, "Reordering indices")
+        self.preview_indices = self.preview_indices.T[order].T  # transpose so the last (frame) axis is the one being indexed
+
     def load_unmapped_raster_scan(self, indices):
         intensities = {}
         positions = {}
@@ -732,6 +754,7 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape,
                 self.preview_indices = np.array([indices[1][::skip,::skip].flatten(), indices[0][::skip,::skip].flatten()], dtype=int)
                 if self.framefilter is not None:
                     self.preview_indices = self.preview_indices[:,self.framefilter[indices[1][::skip,::skip], indices[0][::skip,::skip]].flatten()]
+                self._reorder_preview_indices()
                 self.num_frames = len(self.preview_indices[0])
 
             else:
@@ -748,6 +771,7 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape,
                 self.preview_indices = indices[::skip]
                 if self.framefilter is not None:
                     self.preview_indices = self.preview_indices[self.framefilter[indices][::skip]]
+                self._reorder_preview_indices()
                 self.num_frames = len(self.preview_indices)
 
         elif ((len(positions_fast_shape)>1) and (len(positions_slow_shape)>1)) and data_shape[0] == np.prod(positions_fast_shape) == np.prod(positions_slow_shape):
@@ -776,6 +800,7 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape,
             self.preview_indices = np.array([indices[1][::skip,::skip].flatten(), indices[0][::skip,::skip].flatten()])
             if self.framefilter:
                 log(3, "Framefilter not supported for this case")
+            self._reorder_preview_indices()
             self.num_frames = len(self.preview_indices[0])
             self._ismapped = False
             self._scantype = 'raster'
@@ -809,6 +834,7 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape,
                 self.preview_indices = np.array([indices[1][::skip,::skip].flatten(), indices[0][::skip,::skip].flatten()], dtype=int)
                 if self.framefilter:
                     log(3, "Framefilter not supported for this case")
+                self._reorder_preview_indices()
                 self.num_frames = len(self.preview_indices[0])
                 self._ismapped = True
                 self._scantype = 'raster'
@@ -840,6 +866,7 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape,
                 self.preview_indices = np.array([indices[1][::skip,::skip].flatten(), indices[0][::skip,::skip].flatten()], dtype=int)
                 if self.framefilter:
                     log(3, "Framefilter not supported for this case")
+                self._reorder_preview_indices()
                 self.num_frames = len(self.preview_indices[0])
                 self._ismapped = False
                 self._scantype = 'raster'
diff --git a/test/ptyscan_tests/hdf5_loader_test.py b/test/ptyscan_tests/hdf5_loader_test.py
index cdda48bf0..57e1a3933 100644
--- a/test/ptyscan_tests/hdf5_loader_test.py
+++ b/test/ptyscan_tests/hdf5_loader_test.py
@@ -325,6 +325,106 @@ def test_position_data_mapping_case_1_with_framefilter(self):
         np.testing.assert_equal(out_data.shape, ground_truth.shape, err_msg="The shapes don't match for the positions for case 1 with framefilter")
         np.testing.assert_array_equal(out_data, ground_truth, err_msg='There is something up with the positions for case 1 with framefilter')
 
+    def test_position_data_mapping_case_1_with_frameorder_1(self):
+        '''
+        axis_data.shape (A, B) for data.shape (A, B, frame_size_m, frame_size_n), with frameorder.indices set to a shuffled permutation of all A*B frames.
+        '''
+        A = 106
+        B = 101
+        frame_size_m = 5
+        frame_size_n = 5
+
+        positions_slow = np.arange(A)
+        positions_fast = np.arange(B)
+        fast, slow = np.meshgrid(positions_fast, positions_slow) # just pretend it's a simple grid
+        fast = fast[..., np.newaxis, np.newaxis]
+        slow = slow[..., np.newaxis, np.newaxis]
+        # now chuck them in the files
+        with h5.File(self.positions_file, 'w') as f:
+            f[self.positions_slow_key] = slow
+            f[self.positions_fast_key] = fast
+
+        # make up some data ...
+        data = np.arange(A*B*frame_size_m*frame_size_n).reshape(A, B, frame_size_m, frame_size_n)
+        with h5.File(self.intensity_file, 'w') as f:
+            f[self.intensity_key] = data
+
+        # frameorder: every flattened frame index exactly once, in shuffled order
+        frameorder = np.arange(A*B)
+        np.random.shuffle(frameorder)
+
+        data_params = u.Param()
+        data_params.auto_center = False
+        data_params.intensities = u.Param()
+        data_params.intensities.file = self.intensity_file
+        data_params.intensities.key = self.intensity_key
+
+        data_params.positions = u.Param()
+        data_params.positions.file = self.positions_file
+        data_params.positions.slow_key = self.positions_slow_key
+        data_params.positions.fast_key = self.positions_fast_key
+
+        data_params.frameorder = u.Param()
+        data_params.frameorder.indices = frameorder
+        output = PtyscanTestRunner(Hdf5Loader, data_params, auto_frames=A*B, cleanup=False)
+
+        with h5.File(output['output_file'],'r') as f:
+            out_data = f['chunks/0/data'][...].squeeze()
+        ground_truth = data.reshape((-1, frame_size_m, frame_size_n))[frameorder]
+        np.testing.assert_equal(out_data.shape, ground_truth.shape, err_msg="The shapes don't match for the positions for case 1 with different frameorder")
+        np.testing.assert_array_equal(out_data, ground_truth, err_msg='There is something up with the positions for case 1 with different frameorder')
+
+
+    def test_position_data_mapping_case_1_with_frameorder_2(self):
+        '''
+        axis_data.shape (A, B) for data.shape (A, B, frame_size_m, frame_size_n), with frameorder.indices holding every frame plus randomly repeated frames, shuffled.
+        '''
+        A = 106
+        B = 101
+        frame_size_m = 5
+        frame_size_n = 5
+
+        positions_slow = np.arange(A)
+        positions_fast = np.arange(B)
+        fast, slow = np.meshgrid(positions_fast, positions_slow) # just pretend it's a simple grid
+        fast = fast[..., np.newaxis, np.newaxis]
+        slow = slow[..., np.newaxis, np.newaxis]
+        # now chuck them in the files
+        with h5.File(self.positions_file, 'w') as f:
+            f[self.positions_slow_key] = slow
+            f[self.positions_fast_key] = fast
+
+        # make up some data ...
+        data = np.arange(A*B*frame_size_m*frame_size_n).reshape(A, B, frame_size_m, frame_size_n)
+        with h5.File(self.intensity_file, 'w') as f:
+            f[self.intensity_key] = data
+
+        # frameorder: all A*B frame indices plus int(0.1*A*B) randomly drawn repeats, shuffled (longer than the scan itself)
+        frameorder = np.hstack([np.arange(A*B), np.random.randint(A*B, size=int(0.1*A*B))])
+        np.random.shuffle(frameorder)
+
+        data_params = u.Param()
+        data_params.auto_center = False
+        data_params.intensities = u.Param()
+        data_params.intensities.file = self.intensity_file
+        data_params.intensities.key = self.intensity_key
+
+        data_params.positions = u.Param()
+        data_params.positions.file = self.positions_file
+        data_params.positions.slow_key = self.positions_slow_key
+        data_params.positions.fast_key = self.positions_fast_key
+
+        data_params.frameorder = u.Param()
+        data_params.frameorder.indices = frameorder
+        output = PtyscanTestRunner(Hdf5Loader, data_params, auto_frames=len(frameorder), cleanup=False)
+
+        with h5.File(output['output_file'],'r') as f:
+            out_data = f['chunks/0/data'][...].squeeze()
+        ground_truth = data.reshape((-1, frame_size_m, frame_size_n))[frameorder]
+        np.testing.assert_equal(out_data.shape, ground_truth.shape, err_msg="The shapes don't match for the positions for case 1 with different frameorder")
+        np.testing.assert_array_equal(out_data, ground_truth, err_msg='There is something up with the positions for case 1 with different frameorder')
+
+
     def test_darkfield_applied_case_1(self):
         '''
         Applies the darkfield and assumes it is shaped like the data
@@ -600,6 +700,57 @@ def test_position_data_mapping_case_2_with_framefilter(self):
                                       err_msg='There is something up with the positions for case 2 with framefilter')
 
 
+    def test_position_data_mapping_case_2_with_frameorder(self):
+        '''
+        axis_data.shape (k,) for data.shape (k, frame_size_m, frame_size_n), with frameorder.indices holding every frame plus randomly repeated frames, shuffled.
+        '''
+        k = 12
+        frame_size_m = 5
+        frame_size_n = 5
+
+        positions_slow = np.arange(k)
+        positions_fast = np.arange(k)
+
+        # now chuck them in the files
+        with h5.File(self.positions_file, 'w') as f:
+            f[self.positions_slow_key] = positions_slow
+            f[self.positions_fast_key] = positions_fast
+
+        # make up some data ...
+        data = np.arange(k*frame_size_m*frame_size_n).reshape(k, frame_size_m, frame_size_n)
+        with h5.File(self.intensity_file, 'w') as f:
+            f[self.intensity_key] = data
+
+        # frameorder: all k frame indices plus int(0.1*k) randomly drawn repeats, shuffled
+        frameorder = np.hstack([np.arange(k), np.random.randint(k, size=int(0.1*k))])
+        np.random.shuffle(frameorder)
+
+        data_params = u.Param()
+        data_params.auto_center = False
+        data_params.intensities = u.Param()
+        data_params.intensities.file = self.intensity_file
+        data_params.intensities.key = self.intensity_key
+
+        data_params.positions = u.Param()
+        data_params.positions.file = self.positions_file
+        data_params.positions.slow_key = self.positions_slow_key
+        data_params.positions.fast_key = self.positions_fast_key
+
+        data_params.frameorder = u.Param()
+        data_params.frameorder.indices = frameorder
+
+        output = PtyscanTestRunner(Hdf5Loader, data_params, auto_frames=len(frameorder), cleanup=False)
+
+        with h5.File(output['output_file'], 'r') as f:
+            out_data = f['chunks/0/data'][...].squeeze()
+        ground_truth = data.reshape((-1, frame_size_m, frame_size_n))[frameorder]
+
+        np.testing.assert_equal(ground_truth.shape, out_data.shape,
+                                err_msg="The shapes don't match for the positions for case 2 with different order of frames")
+        np.testing.assert_array_equal(ground_truth, out_data,
+                                      err_msg='There is something up with the positions for case 2 with different order of frames')
+
+
     def test_flatfield_applied_case_2(self):
         '''
         Applies the flatfield and assumes it is shaped like a single frame
@@ -865,6 +1016,58 @@ def test_position_data_mapping_case_3_with_skipping(self):
         np.testing.assert_array_equal(out_data, ground_truth,
                                       err_msg='There is something up with the positions for case 4 with skipping')
 
+    def test_position_data_mapping_case_3_with_frameorder(self):
+        '''
+        axis_data.shape (C, D) for data.shape (C*D, frame_size_m, frame_size_n), with frameorder.indices holding every frame plus randomly repeated frames, shuffled.
+        '''
+        C = 10
+        D = 11
+        frame_size_m = 5
+        frame_size_n = 5
+
+        positions_slow = np.arange(C)
+        positions_fast = np.arange(D)
+        fast, slow = np.meshgrid(positions_fast, positions_slow) # just pretend it's a simple grid
+        # now chuck them in the files
+        with h5.File(self.positions_file, 'w') as f:
+            f[self.positions_slow_key] = slow
+            f[self.positions_fast_key] = fast
+
+        # make up some data ...
+        data = np.arange(C*D*frame_size_m*frame_size_n).reshape(C*D, frame_size_m, frame_size_n)
+        with h5.File(self.intensity_file, 'w') as f:
+            f[self.intensity_key] = data
+
+        # frameorder: all C*D frame indices plus int(0.1*C*D) randomly drawn repeats, shuffled
+        frameorder = np.hstack([np.arange(C*D), np.random.randint(C*D, size=int(0.1*C*D))])
+        np.random.shuffle(frameorder)
+
+        data_params = u.Param()
+        data_params.auto_center = False
+        data_params.intensities = u.Param()
+        data_params.intensities.file = self.intensity_file
+        data_params.intensities.key = self.intensity_key
+
+        data_params.positions = u.Param()
+        data_params.positions.file = self.positions_file
+        data_params.positions.slow_key = self.positions_slow_key
+        data_params.positions.fast_key = self.positions_fast_key
+
+        data_params.frameorder = u.Param()
+        data_params.frameorder.indices = frameorder
+
+        output = PtyscanTestRunner(Hdf5Loader, data_params, auto_frames=len(frameorder), cleanup=False)
+
+        with h5.File(output['output_file'], 'r') as f:
+            out_data = f['chunks/0/data'][...].squeeze()
+        ground_truth = data.reshape((-1, frame_size_m, frame_size_n))[frameorder]
+
+        np.testing.assert_equal(out_data.shape, ground_truth.shape,
+                                err_msg="The shapes don't match for the positions for case 3 with different order of frames")
+        np.testing.assert_array_equal(out_data, ground_truth,
+                                      err_msg='There is something up with the positions for case 3 with different order of frames')
+
+
     def test_position_data_mapping_case_4(self):
         '''
         axis_data.shape (C,) for data.shape (C, D, frame_size_m, frame_size_n) where D is the size of the other axis,
@@ -1008,6 +1211,57 @@ def test_position_data_mapping_case_4_with_skipping(self):
         np.testing.assert_array_equal(out_data, ground_truth,
                                       err_msg='There is something up with the positions for case 4 with skipping')
 
+
+    def test_position_data_mapping_case_4_with_frameorder(self):
+        '''
+        axis_data.shape (C,) for data.shape (C, D, frame_size_m, frame_size_n) where D is the size of the other axis, with frameorder.indices holding every frame plus randomly repeated frames, shuffled.
+        '''
+        C = 4
+        D = 8
+        frame_size_m = 5
+        frame_size_n = 5
+
+        slow = np.arange(C)
+        fast = np.arange(D)
+        # now chuck them in the files
+        with h5.File(self.positions_file, 'w') as f:
+            f[self.positions_slow_key] = slow
+            f[self.positions_fast_key] = fast
+
+        # make up some data ...
+        data = np.arange(C*D*frame_size_m*frame_size_n).reshape(C, D, frame_size_m, frame_size_n)
+        with h5.File(self.intensity_file, 'w') as f:
+            f[self.intensity_key] = data
+
+        # frameorder: all C*D frame indices plus int(0.1*C*D) randomly drawn repeats, shuffled
+        frameorder = np.hstack([np.arange(C*D), np.random.randint(C*D, size=int(0.1*C*D))])
+        np.random.shuffle(frameorder)
+
+        data_params = u.Param()
+        data_params.auto_center = False
+        data_params.intensities = u.Param()
+        data_params.intensities.file = self.intensity_file
+        data_params.intensities.key = self.intensity_key
+
+        data_params.positions = u.Param()
+        data_params.positions.file = self.positions_file
+        data_params.positions.slow_key = self.positions_slow_key
+        data_params.positions.fast_key = self.positions_fast_key
+
+        data_params.frameorder = u.Param()
+        data_params.frameorder.indices = frameorder
+
+        output = PtyscanTestRunner(Hdf5Loader, data_params, auto_frames=len(frameorder), cleanup=False)
+
+        with h5.File(output['output_file'], 'r') as f:
+            out_data = f['chunks/0/data'][...].squeeze()
+        ground_truth = data.reshape((-1, frame_size_m, frame_size_n))[frameorder]
+        np.testing.assert_equal(out_data.shape, ground_truth.shape,
+                                err_msg="The shapes don't match for the positions for case 4 with different order of frames")
+        np.testing.assert_array_equal(out_data, ground_truth,
+                                      err_msg='There is something up with the positions for case 4 with different order of frames')
+
+
     def test_position_data_mapping_case_5(self):
         '''
         axis_data.shape (C,) for data.shape (C*D, frame_size_m, frame_size_n) where D is the size of the other axis.

From a7b635d2e75ccea9e784824ed4f7c8511cf0f224 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Fri, 2 Feb 2024 16:49:23 +0000
Subject: [PATCH 35/37] Changes to Cupy backend (#483)

* Improve FFT chooser for GPUs

* improve logging of available GPU memory

* formatting and more info about GPU memory

* separate filtered FFT from Cupy FFT

* reorganised engine templates

* fixes needed to get EPIE_cupy engine working

* load both pycuda and cupy

* separate filtered FFT from Skcuda fft (which is broken atm)

* import was missing

* changed name of templates folder
---
 ptypy/__init__.py                             |  2 +-
 ptypy/accelerate/cuda_cupy/cufft.py           | 29 +++++--
 ptypy/accelerate/cuda_cupy/engines/ML_cupy.py |  2 +
 .../cuda_cupy/engines/projectional_cupy.py    | 57 ++++++--------
 .../engines/projectional_cupy_stream.py       | 78 +++++++------------
 .../cuda_cupy/engines/stochastic.py           | 20 +++--
 ptypy/accelerate/cuda_cupy/kernels.py         | 10 ++-
 ptypy/accelerate/cuda_pycuda/cufft.py         | 51 +++++++++---
 .../cuda_pycuda/engines/ML_pycuda.py          |  3 +-
 .../engines/projectional_pycuda.py            |  4 +
 .../engines/projectional_pycuda_stream.py     |  5 +-
 .../cuda_pycuda/engines/stochastic.py         |  4 +-
 ptypy/accelerate/cuda_pycuda/kernels.py       |  1 +
 .../engines/cupy/moonflower_DM_ML_cupy.py     | 68 ++++++++++++++++
 templates/engines/cupy/moonflower_DM_cupy.py  | 57 ++++++++++++++
 .../cupy/moonflower_DM_cupy_nostream.py       | 58 ++++++++++++++
 .../engines/cupy/moonflower_EPIE_ML_cupy.py   | 75 ++++++++++++++++++
 .../engines/cupy/moonflower_EPIE_cupy.py      | 59 ++++++++++++++
 .../engines/cupy/moonflower_ML_ML_cupy.py     | 72 +++++++++++++++++
 templates/engines/cupy/moonflower_ML_cupy.py  | 63 +++++++++++++++
 .../engines/cupy/moonflower_RAAR_ML_cupy.py   | 69 ++++++++++++++++
 .../engines/cupy/moonflower_RAAR_cupy.py      | 58 ++++++++++++++
 templates/engines/cupy/moonflower_SDR_cupy.py | 61 +++++++++++++++
 .../engines/{ => legacy}/moonflower_DM_ocl.py |  0
 .../engines/{ => numpy}/moonflower_DM.py      |  0
 .../engines/{ => numpy}/moonflower_DM_ML.py   |  0
 .../engines/{ => numpy}/moonflower_EPIE.py    |  0
 .../{ => numpy}/moonflower_ML_Euclid.py       |  0
 .../{ => numpy}/moonflower_ML_Gaussian.py     |  0
 .../engines/{ => numpy}/moonflower_ML_ML.py   |  0
 .../{ => numpy}/moonflower_ML_Poisson.py      |  0
 .../engines/{ => numpy}/moonflower_RAAR.py    |  0
 .../engines/{ => numpy}/moonflower_RAAR_ML.py |  0
 .../engines/{ => numpy}/moonflower_SDR.py     |  0
 .../{ => pycuda}/moonflower_DM_ML_pycuda.py   |  0
 .../{ => pycuda}/moonflower_DM_pycuda.py      |  0
 .../moonflower_DM_pycuda_nostream.py          |  0
 .../{ => pycuda}/moonflower_EPIE_ML_pycuda.py |  0
 .../{ => pycuda}/moonflower_EPIE_pycuda.py    |  0
 .../{ => pycuda}/moonflower_ML_ML_pycuda.py   |  0
 .../{ => pycuda}/moonflower_ML_pycuda.py      |  0
 .../{ => pycuda}/moonflower_RAAR_ML_pycuda.py |  0
 .../{ => pycuda}/moonflower_RAAR_pycuda.py    |  0
 .../{ => pycuda}/moonflower_SDR_pycuda.py     |  0
 .../{ => serial}/moonflower_DM_serial.py      |  0
 .../{ => serial}/moonflower_EPIE_serial.py    |  0
 .../{ => serial}/moonflower_ML_serial.py      |  0
 .../{ => serial}/moonflower_RAAR_serial.py    |  0
 .../{ => serial}/moonflower_SDR_serial.py     |  0
 .../cuda_cupy_tests/fft_scaling_test.py       | 62 +++++++++++++--
 .../cuda_pycuda_tests/fft_scaling_test.py     | 16 ++++
 51 files changed, 858 insertions(+), 126 deletions(-)
 create mode 100644 templates/engines/cupy/moonflower_DM_ML_cupy.py
 create mode 100644 templates/engines/cupy/moonflower_DM_cupy.py
 create mode 100644 templates/engines/cupy/moonflower_DM_cupy_nostream.py
 create mode 100644 templates/engines/cupy/moonflower_EPIE_ML_cupy.py
 create mode 100644 templates/engines/cupy/moonflower_EPIE_cupy.py
 create mode 100644 templates/engines/cupy/moonflower_ML_ML_cupy.py
 create mode 100644 templates/engines/cupy/moonflower_ML_cupy.py
 create mode 100644 templates/engines/cupy/moonflower_RAAR_ML_cupy.py
 create mode 100644 templates/engines/cupy/moonflower_RAAR_cupy.py
 create mode 100644 templates/engines/cupy/moonflower_SDR_cupy.py
 rename templates/engines/{ => legacy}/moonflower_DM_ocl.py (100%)
 rename templates/engines/{ => numpy}/moonflower_DM.py (100%)
 rename templates/engines/{ => numpy}/moonflower_DM_ML.py (100%)
 rename templates/engines/{ => numpy}/moonflower_EPIE.py (100%)
 rename templates/engines/{ => numpy}/moonflower_ML_Euclid.py (100%)
 rename templates/engines/{ => numpy}/moonflower_ML_Gaussian.py (100%)
 rename templates/engines/{ => numpy}/moonflower_ML_ML.py (100%)
 rename templates/engines/{ => numpy}/moonflower_ML_Poisson.py (100%)
 rename templates/engines/{ => numpy}/moonflower_RAAR.py (100%)
 rename templates/engines/{ => numpy}/moonflower_RAAR_ML.py (100%)
 rename templates/engines/{ => numpy}/moonflower_SDR.py (100%)
 rename templates/engines/{ => pycuda}/moonflower_DM_ML_pycuda.py (100%)
 rename templates/engines/{ => pycuda}/moonflower_DM_pycuda.py (100%)
 rename templates/engines/{ => pycuda}/moonflower_DM_pycuda_nostream.py (100%)
 rename templates/engines/{ => pycuda}/moonflower_EPIE_ML_pycuda.py (100%)
 rename templates/engines/{ => pycuda}/moonflower_EPIE_pycuda.py (100%)
 rename templates/engines/{ => pycuda}/moonflower_ML_ML_pycuda.py (100%)
 rename templates/engines/{ => pycuda}/moonflower_ML_pycuda.py (100%)
 rename templates/engines/{ => pycuda}/moonflower_RAAR_ML_pycuda.py (100%)
 rename templates/engines/{ => pycuda}/moonflower_RAAR_pycuda.py (100%)
 rename templates/engines/{ => pycuda}/moonflower_SDR_pycuda.py (100%)
 rename templates/engines/{ => serial}/moonflower_DM_serial.py (100%)
 rename templates/engines/{ => serial}/moonflower_EPIE_serial.py (100%)
 rename templates/engines/{ => serial}/moonflower_ML_serial.py (100%)
 rename templates/engines/{ => serial}/moonflower_RAAR_serial.py (100%)
 rename templates/engines/{ => serial}/moonflower_SDR_serial.py (100%)

diff --git a/ptypy/__init__.py b/ptypy/__init__.py
index 5b34c35fa..74c336d01 100644
--- a/ptypy/__init__.py
+++ b/ptypy/__init__.py
@@ -83,7 +83,7 @@ def load_gpu_engines(arch='cuda'):
         from .accelerate.cuda_pycuda.engines import projectional_pycuda_stream
         from .accelerate.cuda_pycuda.engines import stochastic
         from .accelerate.cuda_pycuda.engines import ML_pycuda
-    if arch=='cupy':
+    if arch in ['cuda', 'cupy']:
         from .accelerate.cuda_cupy.engines import projectional_cupy
         from .accelerate.cuda_cupy.engines import projectional_cupy_stream
         from .accelerate.cuda_cupy.engines import stochastic
diff --git a/ptypy/accelerate/cuda_cupy/cufft.py b/ptypy/accelerate/cuda_cupy/cufft.py
index 707aba2f7..450d6455e 100644
--- a/ptypy/accelerate/cuda_cupy/cufft.py
+++ b/ptypy/accelerate/cuda_cupy/cufft.py
@@ -4,8 +4,7 @@
 from . import load_kernel
 import numpy as np
 
-
-class FFT_cuda(object):
+class FFT_base(object):
 
     def __init__(self, array, queue=None,
                  inplace=False,
@@ -18,17 +17,31 @@ def __init__(self, array, queue=None,
         if dims < 2:
             raise AssertionError('Input array must be at least 2-dimensional')
         self.arr_shape = (array.shape[-2], array.shape[-1])
-        rows = self.arr_shape[0]
-        columns = self.arr_shape[1]
-        if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]:
-            raise ValueError(
-                "CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048")
         self.batches = int(np.prod(
             array.shape[0:dims-2]) if dims > 2 else 1)
         self.forward = forward
 
         self._load(array, pre_fft, post_fft, symmetric, forward)
 
+class FFT_cuda(FFT_base):
+
+    def __init__(self, array, queue=None,
+                 inplace=False,
+                 pre_fft=None,
+                 post_fft=None,
+                 symmetric=True,
+                 forward=True):
+        rows, columns = array.shape[-2:] if len(array.shape) >= 2 else (None, None)  # leave <2-D input to FFT_base's own dimensionality check
+        if rows is not None and (rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]):  # size limits apply to this class only
+            raise ValueError(
+                "CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048")
+        super(FFT_cuda, self).__init__(array, queue=queue,
+                                       inplace=inplace,
+                                       pre_fft=pre_fft,
+                                       post_fft=post_fft,
+                                       symmetric=symmetric,
+                                       forward=forward)
+
     def _load(self, array, pre_fft, post_fft, symmetric, forward):
         if pre_fft is not None:
             self.pre_fft = cp.asarray(pre_fft)
@@ -71,7 +84,7 @@ def _ift(self, input, output):
         self.fftobj.ifft(input.data.ptr, output.data.ptr)
 
 
-class FFT_cupy(FFT_cuda):
+class FFT_cupy(FFT_base):
 
     @property
     def queue(self):
diff --git a/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py b/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
index c3cb39c09..efcc42338 100644
--- a/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
+++ b/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py
@@ -165,6 +165,8 @@ def _setup_kernels(self):
         # TODO grow blocks dynamically
         nma = min(fit, MAX_BLOCKS)
         log_device_memory_stats(4)
+        log(4, 'Free memory available: {:.2f} GB'.format(float(mem)/(1024**3)))
+        log(4, 'Memory to be allocated per block: {:.2f} GB'.format(float(blk)/(1024**3)))
         log(4, 'CuPy max blocks fitting on GPU: ma_arrays={}'.format(nma))
         # reset memory or create new
         self.w_data = GpuDataManager(ma_mem, 0, nma, False)
diff --git a/ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py
index f0c6ba40a..45eb4d016 100644
--- a/ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py
+++ b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py
@@ -9,12 +9,11 @@
 """
 
 import numpy as np
-import time
 import cupy as cp
 
 from ptypy import utils as u
-from ptypy.accelerate.cuda_cupy import get_context, log_device_memory_stats
-from ptypy.utils.verbose import logger, log
+from ptypy.accelerate.cuda_cupy import get_context
+from ptypy.utils.verbose import log
 from ptypy.utils import parallel
 from ptypy.engines import register
 from ptypy.engines.projectional import DMMixin, RAARMixin
@@ -119,12 +118,16 @@ def _setup_kernels(self):
             # create buffer arrays
             ash = (fpc * nmodes,) + tuple(geo.shape)
             aux = np.zeros(ash, dtype=np.complex64)
+            mempool = cp.get_default_memory_pool()
+            mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes()
+            if not int(mem) // aux.nbytes:
+                log(1,"Cannot fit memory into device, if possible reduce frames per block or nr. of modes. Exiting...")
+                raise SystemExit("ptypy has been exited.")
             kern.aux = cp.asarray(aux)
 
             # setup kernels, one for each SCAN.
             log(4, "Setting up FourierUpdateKernel")
-            kern.FUK = FourierUpdateKernel(
-                aux, nmodes, queue_thread=self.queue)
+            kern.FUK = FourierUpdateKernel(aux, nmodes, queue_thread=self.queue)
             kern.FUK.allocate()
 
             log(4, "Setting up PoUpdateKernel")
@@ -142,15 +145,13 @@ def _setup_kernels(self):
             kern.TK = TransposeKernel(queue=self.queue)
 
             log(4, "Setting up PropagationKernel")
-            kern.PROP = PropagationKernel(
-                aux, geo.propagator, self.queue, self.p.fft_lib)
+            kern.PROP = PropagationKernel(aux, geo.propagator, self.queue, self.p.fft_lib)
             kern.PROP.allocate()
             kern.resolution = geo.resolution[0]
 
             if self.do_position_refinement:
                 log(4, "Setting up PositionCorrectionKernel")
-                kern.PCK = PositionCorrectionKernel(
-                    aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue)
+                kern.PCK = PositionCorrectionKernel(aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue)
                 kern.PCK.allocate()
             log(4, "Kernel setup completed")
 
@@ -179,8 +180,7 @@ def engine_prepare(self):
             prep = self.diff_info[d.ID]
             prep.addr_gpu = cp.asarray(prep.addr)
             if use_tiles:
-                prep.addr2 = np.ascontiguousarray(
-                    np.transpose(prep.addr, (2, 3, 0, 1)))
+                prep.addr2 = np.ascontiguousarray(np.transpose(prep.addr, (2, 3, 0, 1)))
                 prep.addr2_gpu = cp.asarray(prep.addr2)
             if self.do_position_refinement:
                 prep.mangled_addr_gpu = prep.addr_gpu.copy()
@@ -262,8 +262,7 @@ def engine_iterate(self, num=1):
 
                 # build exit wave
                 #AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha)
-                AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b,
-                              c_po=self._a, c_e=-(self._a + self._b))
+                AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b, c_po=self._a, c_e=-(self._a + self._b))
                 FUK.exit_error(aux, addr)
                 FUK.error_reduce(addr, err_exit)
 
@@ -294,8 +293,7 @@ def engine_iterate(self, num=1):
             err_fourier = prep.err_fourier_gpu.get()
             err_phot = prep.err_phot_gpu.get()
             err_exit = prep.err_exit_gpu.get()
-            errs = np.ascontiguousarray(
-                np.vstack([err_fourier, err_phot, err_exit]).T)
+            errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T)
             error.update(zip(prep.view_IDs, errs))
 
         self.error = error
@@ -307,12 +305,9 @@ def position_update(self):
         """
         if not self.do_position_refinement or (not self.curiter):
             return
-        do_update_pos = (self.p.position_refinement.stop >
-                         self.curiter >= self.p.position_refinement.start)
-        do_update_pos &= (self.curiter %
-                          self.p.position_refinement.interval) == 0
-        use_tiles = (not self.p.probe_update_cuda_atomics) or (
-            not self.p.object_update_cuda_atomics)
+        do_update_pos = (self.p.position_refinement.stop > self.curiter >= self.p.position_refinement.start)
+        do_update_pos &= (self.curiter % self.p.position_refinement.interval) == 0
+        use_tiles = (not self.p.probe_update_cuda_atomics) or (not self.p.object_update_cuda_atomics)
 
         # Update positions
         if do_update_pos:
@@ -364,18 +359,15 @@ def position_update(self):
 
                 log(4, 'Position refinement trial: iteration %s' % (self.curiter))
                 for i in range(PCK.mangler.nshifts):
-                    PCK.mangler.get_address(
-                        i, addr, mangled_addr, max_oby, max_obx)
+                    PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx)
                     PCK.build_aux(aux, mangled_addr, ob, pr)
                     PROP.fw(aux, aux)
                     if self.p.position_refinement.metric == "fourier":
                         PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum)
                         PCK.error_reduce(mangled_addr, err_fourier)
                     if self.p.position_refinement.metric == "photon":
-                        PCK.log_likelihood(
-                            aux, mangled_addr, mag, ma, err_fourier)
-                    PCK.update_addr_and_error_state(
-                        addr, error_state, mangled_addr, err_fourier)
+                        PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier)
+                    PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier)
 
                 cp.cuda.runtime.memcpyAsync(dst=err_fourier.data.ptr,
                                             src=error_state.data.ptr,
@@ -413,8 +405,7 @@ def center_probe(self):
                     prep = self.diff_info[dID]
                     pID, oID, eID = prep.poe_IDs
                     if pID == name:
-                        self.ex.S[eID].gpu = self.ISK.interpolate_shift(
-                            self.ex.S[eID].gpu, shift)
+                        self.ex.S[eID].gpu = self.ISK.interpolate_shift(self.ex.S[eID].gpu, shift)
 
                 log(4, 'Probe recentered from %s to %s'
                     % (str(tuple(c1)), str(tuple(c2))))
@@ -533,8 +524,7 @@ def support_constraint(self, storage=None):
         if support is not None:
             if storage.ID not in self.FSK:
                 supp = support.astype(np.complex64)
-                self.FSK[storage.ID] = FourierSupportKernel(
-                    supp, self.queue, self.p.fft_lib)
+                self.FSK[storage.ID] = FourierSupportKernel(supp, self.queue, self.p.fft_lib)
                 self.FSK[storage.ID].allocate()
             self.FSK[storage.ID].apply_fourier_support(storage.gpu)
 
@@ -542,8 +532,7 @@ def support_constraint(self, storage=None):
         support = self._probe_support.get(storage.ID)
         if support is not None:
             if storage.ID not in self.RSK:
-                self.RSK[storage.ID] = RealSupportKernel(
-                    support.astype(np.complex64))
+                self.RSK[storage.ID] = RealSupportKernel(support.astype(np.complex64))
                 self.RSK[storage.ID].allocate()
             self.RSK[storage.ID].apply_real_support(storage.gpu)
 
@@ -584,13 +573,11 @@ def engine_finalize(self):
             prep.addr = prep.addr_gpu.get()
             del prep.addr_gpu
 
-
         mempool = cp.get_default_memory_pool()
         mempool.free_all_blocks()
         pinned_pool = cp.get_default_pinned_memory_pool()
         pinned_pool.free_all_blocks()
 
-
         # we don't need the  "benchmarking" in DM_serial
         super().engine_finalize(benchmark=False)
 
diff --git a/ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py
index b64ad5e82..b236874ab 100644
--- a/ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py
+++ b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py
@@ -14,13 +14,11 @@
 """
 
 import numpy as np
-import time
 import cupy as cp
 import cupyx
 
-from ptypy import utils as u
 from ptypy.accelerate.cuda_cupy import log_device_memory_stats
-from ptypy.utils.verbose import log, logger
+from ptypy.utils.verbose import log
 from ptypy.utils import parallel
 from ptypy.engines import register
 from ptypy.engines.projectional import DMMixin, RAARMixin
@@ -78,8 +76,9 @@ def _setup_kernels(self):
         nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS)
         nma = min(fit, MAX_BLOCKS)
         log_device_memory_stats(4)
-        log(4, 'cupy max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(
-            nex, nma))
+        log(4, 'Free memory available: {:.2f} GB'.format(float(mem)/(1024**3)))
+        log(4, 'Memory to be allocated per block: {:.2f} GB'.format(float(blk)/(1024**3)))
+        log(4, 'cupy max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma))
         # reset memory or create new
         self.ex_data = GpuDataManager(ex_mem, 0, nex, True)
         self.ma_data = GpuDataManager(ma_mem, 0, nma, False)
@@ -102,8 +101,7 @@ def engine_prepare(self):
         for name, s in self.pr_nrm.S.items():
             s.gpu, s.data = mppa(s.data)
 
-        use_tiles = (not self.p.probe_update_cuda_atomics) or (
-            not self.p.object_update_cuda_atomics)
+        use_tiles = (not self.p.probe_update_cuda_atomics) or (not self.p.object_update_cuda_atomics)
 
         # Extra object buffer for smoothing kernel
         if self.p.obj_smooth_std is not None:
@@ -143,8 +141,7 @@ def engine_prepare(self):
             prep.mag = cupyx.empty_pinned(mag.shape, mag.dtype, order="C")
             prep.mag[:] = mag
 
-            log(4, 'Free memory on device: %.2f GB' %
-                (float(cp.cuda.runtime.memGetInfo()[0])/1e9))
+            log(4, 'Free memory on device: {:.2f} GB'.format(float(cp.cuda.runtime.memGetInfo()[0])/(1024**3)))
             self.ex_data.add_data_block()
             self.ma_data.add_data_block()
             self.mag_data.add_data_block()
@@ -168,8 +165,7 @@ def engine_iterate(self, num=1):
                 change = 0
 
                 do_update_probe = (self.curiter >= self.p.probe_update_start)
-                do_update_object = (self.p.update_object_first or (
-                    inner > 0) or not do_update_probe)
+                do_update_object = (self.p.update_object_first or (inner > 0) or not do_update_probe)
                 do_update_fourier = (inner == 0)
 
                 # initialize probe and object buffer to receive an update
@@ -185,8 +181,7 @@ def engine_iterate(self, num=1):
                                           self.p.obj_smooth_std]
                             # We need a third copy, because we still need ob.gpu for the fourier update
                             obb.gpu[:] = ob.gpu[:]
-                            self.GSK.convolution(
-                                obb.gpu, smooth_mfs, tmp=obb.tmp)
+                            self.GSK.convolution(obb.gpu, smooth_mfs, tmp=obb.tmp)
                             obb.gpu *= np.complex64(cfact)
                         else:
                             # obb.gpu[:] = ob.gpu * np.complex64(cfact)
@@ -225,8 +220,7 @@ def engine_iterate(self, num=1):
                     pr = self.pr.S[pID].gpu
 
                     # Schedule ex to device
-                    ev_ex, ex, data_ex = self.ex_data.to_gpu(
-                        prep.ex, dID, self.qu_htod)
+                    ev_ex, ex, data_ex = self.ex_data.to_gpu(prep.ex, dID, self.qu_htod)
 
                     # Fourier update.
                     if do_update_fourier:
@@ -234,10 +228,8 @@ def engine_iterate(self, num=1):
                         log(4, '----- Fourier update -----', True)
 
                         # Schedule ma & mag to device
-                        ev_ma, ma, data_ma = self.ma_data.to_gpu(
-                            prep.ma, dID, self.qu_htod)
-                        ev_mag, mag, data_mag = self.mag_data.to_gpu(
-                            prep.mag, dID, self.qu_htod)
+                        ev_ma, ma, data_ma = self.ma_data.to_gpu(prep.ma, dID, self.qu_htod)
+                        ev_mag, mag, data_mag = self.mag_data.to_gpu(prep.mag, dID, self.qu_htod)
 
                         # compute log-likelihood
                         if self.p.compute_log_likelihood:
@@ -250,8 +242,7 @@ def engine_iterate(self, num=1):
                         # synchronize h2d stream with compute stream
                         self.queue.wait_event(ev_ex)
                         #AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha)
-                        AWK.make_aux(aux, addr, ob, pr, ex,
-                                     c_po=self._c, c_e=1-self._c)
+                        AWK.make_aux(aux, addr, ob, pr, ex, c_po=self._c, c_e=1-self._c)
 
                         # FFT
                         PROP.fw(aux, aux)
@@ -261,8 +252,7 @@ def engine_iterate(self, num=1):
                         self.queue.wait_event(ev_mag)
                         FUK.fourier_error(aux, addr, mag, ma, ma_sum)
                         FUK.error_reduce(addr, err_fourier)
-                        FUK.fmag_all_update(
-                            aux, addr, mag, ma, err_fourier, pbound)
+                        FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound)
 
                         data_mag.record_done(self.queue, 'compute')
                         data_ma.record_done(self.queue, 'compute')
@@ -270,21 +260,18 @@ def engine_iterate(self, num=1):
                         PROP.bw(aux, aux)
                         # apply changes
                         #AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha)
-                        AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b,
-                                      c_po=self._a, c_e=-(self._a + self._b))
+                        AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b, c_po=self._a, c_e=-(self._a + self._b))
                         FUK.exit_error(aux, addr)
                         FUK.error_reduce(addr, err_exit)
 
-                    prestr = '%d Iteration (Overlap) #%02d:  ' % (
-                        parallel.rank, inner)
+                    prestr = '%d Iteration (Overlap) #%02d:  ' % (parallel.rank, inner)
 
                     # Update object
                     if do_update_object:
                         log(4, prestr + '----- object update -----', True)
                         addrt = addr if atomics_object else addr2
                         self.queue.wait_event(ev_ex)
-                        POK.ob_update(addrt, obb, obn, pr, ex,
-                                      atomics=atomics_object)
+                        POK.ob_update(addrt, obb, obn, pr, ex, atomics=atomics_object)
 
                     data_ex.record_done(self.queue, 'compute')
                     if iblock + len(self.ex_data) < len(self.dID_list):
@@ -327,10 +314,8 @@ def engine_iterate(self, num=1):
             parallel.barrier()
 
             if self.do_position_refinement and (self.curiter):
-                do_update_pos = (self.p.position_refinement.stop >
-                                 self.curiter >= self.p.position_refinement.start)
-                do_update_pos &= (self.curiter %
-                                  self.p.position_refinement.interval) == 0
+                do_update_pos = (self.p.position_refinement.stop > self.curiter >= self.p.position_refinement.start)
+                do_update_pos &= (self.curiter % self.p.position_refinement.interval) == 0
 
                 # Update positions
                 if do_update_pos:
@@ -387,22 +372,17 @@ def engine_iterate(self, num=1):
 
                         log(4, 'Position refinement trial: iteration %s' %
                             (self.curiter))
-                        PCK.mangler.setup_shifts(
-                            self.curiter, nframes=addr.shape[0])
+                        PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0])
                         for i in range(PCK.mangler.nshifts):
-                            PCK.mangler.get_address(
-                                i, addr, mangled_addr, max_oby, max_obx)
+                            PCK.mangler.get_address(i, addr, mangled_addr, max_oby, max_obx)
                             PCK.build_aux(aux, mangled_addr, ob, pr)
                             PROP.fw(aux, aux)
                             if self.p.position_refinement.metric == "fourier":
-                                PCK.fourier_error(
-                                    aux, mangled_addr, mag, ma, ma_sum)
+                                PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum)
                                 PCK.error_reduce(mangled_addr, err_fourier)
                             if self.p.position_refinement.metric == "photon":
-                                PCK.log_likelihood(
-                                    aux, mangled_addr, mag, ma, err_fourier)
-                            PCK.update_addr_and_error_state(
-                                addr, error_state, mangled_addr, err_fourier)
+                                PCK.log_likelihood(aux, mangled_addr, mag, ma, err_fourier)
+                            PCK.update_addr_and_error_state(addr, error_state, mangled_addr, err_fourier)
 
                         data_mag.record_done(self.queue, 'compute')
                         data_ma.record_done(self.queue, 'compute')
@@ -412,12 +392,9 @@ def engine_iterate(self, num=1):
                                                kind=3, # d2d
                                                stream=self.queue.ptr)
                         if use_tiles:
-                            s1 = prep.addr_gpu.shape[0] * \
-                                prep.addr_gpu.shape[1]
-                            s2 = prep.addr_gpu.shape[2] * \
-                                prep.addr_gpu.shape[3]
-                            TK.transpose(prep.addr_gpu.reshape(
-                                s1, s2), prep.addr2_gpu.reshape(s2, s1))
+                            s1 = prep.addr_gpu.shape[0] * prep.addr_gpu.shape[1]
+                            s2 = prep.addr_gpu.shape[2] * prep.addr_gpu.shape[3]
+                            TK.transpose(prep.addr_gpu.reshape(s1, s2), prep.addr2_gpu.reshape(s2, s1))
 
             self.curiter += 1
             self.queue.synchronize()
@@ -436,8 +413,7 @@ def engine_iterate(self, num=1):
             err_fourier = prep.err_fourier_gpu.get()
             err_phot = prep.err_phot_gpu.get()
             err_exit = prep.err_exit_gpu.get()
-            errs = np.ascontiguousarray(
-                np.vstack([err_fourier, err_phot, err_exit]).T)
+            errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T)
             error.update(zip(prep.view_IDs, errs))
 
         self.error = error
diff --git a/ptypy/accelerate/cuda_cupy/engines/stochastic.py b/ptypy/accelerate/cuda_cupy/engines/stochastic.py
index 8af49d635..f798d569e 100644
--- a/ptypy/accelerate/cuda_cupy/engines/stochastic.py
+++ b/ptypy/accelerate/cuda_cupy/engines/stochastic.py
@@ -330,10 +330,10 @@ def engine_iterate(self, num=1):
                     if self._object_norm_is_global and self._pr_a == 0:
                         obn_max = cp.empty((1,), dtype=np.float32)
                         MAK.max_abs2(ob, obn_max)
-                        obn.fill(np.float32(0.), stream=self.queue)
+                        obn.fill(np.float32(0.))
                     else:
                         POK.ob_norm_local(addr, ob, obn)
-                        obn_max = cp.max(obn, stream=self.queue)
+                        obn_max = cp.max(obn)
                     if self.p.probe_update_start <= self.curiter:
                         POK.pr_update_local(
                             addr, pr, ob, ex, aux, obn, obn_max, a=self._pr_a, b=self._pr_b)
@@ -360,9 +360,19 @@ def engine_iterate(self, num=1):
         self.queue.synchronize()
 
         for name, s in self.ob.S.items():
-            s.gpu.get_async(stream=self.qu_dtoh, ary=s.data)
+            #s.gpu.get_async(stream=self.qu_dtoh, ary=s.data)
+            cp.cuda.runtime.memcpyAsync(dst=s.data.ctypes.data,
+                            src=s.gpu.data.ptr,
+                            size=s.gpu.nbytes,
+                            kind=2,  # d2h
+                            stream=self.queue.ptr)
         for name, s in self.pr.S.items():
-            s.gpu.get_async(stream=self.qu_dtoh, ary=s.data)
+            #s.gpu.get_async(stream=self.qu_dtoh, ary=s.data)
+            cp.cuda.runtime.memcpyAsync(dst=s.data.ctypes.data,
+                            src=s.gpu.data.ptr,
+                            size=s.gpu.nbytes,
+                            kind=2,  # d2h
+                            stream=self.queue.ptr)
 
         for dID, prep in self.diff_info.items():
             err_fourier = prep.err_fourier_gpu.get()
@@ -503,7 +513,7 @@ def engine_finalize(self):
         for name, s in self.ob.S.items():
             s.data = np.copy(s.data)
 
-        self.context.detach()
+        #self.context.detach()
         super().engine_finalize()
 
 
diff --git a/ptypy/accelerate/cuda_cupy/kernels.py b/ptypy/accelerate/cuda_cupy/kernels.py
index 6d4de55dd..049108e71 100644
--- a/ptypy/accelerate/cuda_cupy/kernels.py
+++ b/ptypy/accelerate/cuda_cupy/kernels.py
@@ -15,16 +15,18 @@ def choose_fft(arr_shape, fft_type=None):
     columns = arr_shape[1]
     if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]:
         dims_are_powers_of_two = False
-    if dims_are_powers_of_two:
+    if fft_type=='cuda' and not dims_are_powers_of_two:
+        logger.warning('cufft: array dimensions are not powers of two (16 to 2048) - using cufft with separated callbacks')
+        from ptypy.accelerate.cuda_cupy.cufft import FFT_cupy as FFT
+    elif fft_type=='cuda' and dims_are_powers_of_two:
         try:
+            import filtered_cufft
             from ptypy.accelerate.cuda_cupy.cufft import FFT_cuda as FFT
         except:
-            logger.info(
+            logger.warning(
                 'Unable to import optimised cufft version - using cufft with separte callbacks instead')
             from ptypy.accelerate.cuda_cupy.cufft import FFT_cupy as FFT
     else:
-        logger.info(
-            'cufft: array dimensions are not powers of two (16 to 2048) - using cufft with separated callbacks')
         from ptypy.accelerate.cuda_cupy.cufft import FFT_cupy as FFT
     return FFT
 
diff --git a/ptypy/accelerate/cuda_pycuda/cufft.py b/ptypy/accelerate/cuda_pycuda/cufft.py
index 4859b36b2..5364f092d 100644
--- a/ptypy/accelerate/cuda_pycuda/cufft.py
+++ b/ptypy/accelerate/cuda_pycuda/cufft.py
@@ -1,10 +1,9 @@
-import skcuda.fft as cu_fft
-from skcuda.fft import cufft as cufftlib
+
 from pycuda import gpuarray
 from . import load_kernel
 import numpy as np
 
-class FFT_cuda(object):
+class FFT_base(object):
 
     def __init__(self, array, queue=None,
                  inplace=False,
@@ -17,15 +16,29 @@ def __init__(self, array, queue=None,
         if dims < 2:
             raise AssertionError('Input array must be at least 2-dimensional')
         self.arr_shape = (array.shape[-2], array.shape[-1])
-        rows = self.arr_shape[0]
-        columns = self.arr_shape[1]
-        if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]:
-            raise ValueError("CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048")
         self.batches = int(np.prod(array.shape[0:dims-2]) if dims > 2 else 1)
         self.forward = forward
 
         self._load(array, pre_fft, post_fft, symmetric, forward)
 
+class FFT_cuda(FFT_base):
+
+    def __init__(self, array, queue=None,
+                 inplace=False,
+                 pre_fft=None,
+                 post_fft=None,
+                 symmetric=True,
+                 forward=True):
+        rows, columns = (array.shape[-2], array.shape[-1])  # NOTE(review): shape[-2] is indexed before FFT_base's own dimension check runs - confirm <2-D input is handled as intended
+        if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]:  # accept only square arrays of the listed sizes
+            raise ValueError("CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048")
+        super(FFT_cuda, self).__init__(array, queue=queue, 
+                                       inplace=inplace,
+                                       pre_fft=pre_fft,
+                                       post_fft=post_fft,
+                                       symmetric=symmetric,
+                                       forward=forward)
+        
     def _load(self, array, pre_fft, post_fft, symmetric, forward):
         if pre_fft is not None:
             self.pre_fft = gpuarray.to_gpu(pre_fft)
@@ -68,7 +81,23 @@ def _ift(self, input, output):
         self.fftobj.ifft(input.gpudata, output.gpudata)
         
 
-class FFT_skcuda(FFT_cuda):
+class FFT_skcuda(FFT_base):
+
+    def __init__(self, array, queue=None,
+                 inplace=False,
+                 pre_fft=None,
+                 post_fft=None,
+                 symmetric=True,
+                 forward=True):
+        import skcuda.fft as cu_fft  # local import: this module no longer imports skcuda at load time
+        self._fft = cu_fft.fft  # called by _ft
+        self._ifft = cu_fft.ifft  # called by _ift
+        super(FFT_skcuda, self).__init__(array, queue=queue,  # base is FFT_base; super(FFT_cuda, self) raises TypeError here
+                                         inplace=inplace,
+                                         pre_fft=pre_fft,
+                                         post_fft=post_fft,
+                                         symmetric=symmetric,
+                                         forward=forward)
 
     @property
     def queue(self):
@@ -77,6 +106,7 @@ def queue(self):
     @queue.setter
     def queue(self, queue):
         self._queue = queue
+        from skcuda.fft import cufft as cufftlib
         cufftlib.cufftSetStream(self.plan.handle, queue.handle)
 
     def _load(self, array, pre_fft, post_fft, symmetric, forward):
@@ -112,6 +142,7 @@ def _load(self, array, pre_fft, post_fft, symmetric, forward):
             int((self.arr_shape[1] + 31) // 32),
             int(self.batches)
         )
+        import skcuda.fft as cu_fft
         self.plan = cu_fft.Plan(
             self.arr_shape,
             array.dtype,
@@ -166,11 +197,11 @@ def _postfilt(self, y):
 
     def _ft(self, x, y):
         d = self._prefilt(x, y)
-        cu_fft.fft(d, y, self.plan)
+        self._fft(d, y, self.plan)
         self._postfilt(y)
 
     def _ift(self, x, y):
         d = self._prefilt(x, y)
-        cu_fft.ifft(d, y, self.plan)
+        self._ifft(d, y, self.plan)
         self._postfilt(y)
 
diff --git a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py
index 9799e4a5c..339102452 100644
--- a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py
+++ b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py
@@ -166,7 +166,8 @@ def _setup_kernels(self):
 
         # TODO grow blocks dynamically
         nma = min(fit, MAX_BLOCKS)
-        log(4, 'Free memory on device: %.2f GB' % (float(mem)/1e9))
+        log(4, 'Free memory available: {:.2f} GB'.format(float(mem)/(1024**3)))
+        log(4, 'Memory to be allocated per block: {:.2f} GB'.format(float(blk)/(1024**3)))
         log(4, 'PyCUDA max blocks fitting on GPU: ma_arrays={}'.format(nma))
         # reset memory or create new
         self.w_data = GpuDataManager(ma_mem, 0, nma, False)
diff --git a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py
index 5093d6422..a10fceff2 100644
--- a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py
+++ b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py
@@ -120,6 +120,10 @@ def _setup_kernels(self):
             # create buffer arrays
             ash = (fpc * nmodes,) + tuple(geo.shape)
             aux = np.zeros(ash, dtype=np.complex64)
+            mem = cuda.mem_get_info()[0]
+            if not int(mem) // aux.nbytes:
+                log(1,"Cannot fit memory into device, if possible reduce frames per block or nr. of modes. Exiting...")
+                raise SystemExit("ptypy has been exited.")
             kern.aux = gpuarray.to_gpu(aux)
 
             # setup kernels, one for each SCAN.
diff --git a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py
index 193042895..6c54a8074 100644
--- a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py
+++ b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py
@@ -69,7 +69,8 @@ def _setup_kernels(self):
         # TODO grow blocks dynamically
         nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS)
         nma = min(fit, MAX_BLOCKS)
-        log(4, 'Free memory on device: %.2f GB' % (float(mem)/1e9))
+        log(4, 'Free memory available: {:.2f} GB'.format(float(mem)/(1024**3)))
+        log(4, 'Memory to be allocated per block: {:.2f} GB'.format(float(blk)/(1024**3)))
         log(4, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma))
         # reset memory or create new
         self.ex_data = GpuDataManager(ex_mem, 0, nex, True)
@@ -132,7 +133,7 @@ def engine_prepare(self):
             prep.mag = cuda.pagelocked_empty(mag.shape, mag.dtype, order="C", mem_flags=4)
             prep.mag[:] = mag
 
-            log(4, 'Free memory on device: %.2f GB' % (float(cuda.mem_get_info()[0])/1e9))
+            log(4, 'Free memory on device: {:.2f} GB'.format(float(cuda.mem_get_info()[0])/(1024**3)))
             self.ex_data.add_data_block()
             self.ma_data.add_data_block()
             self.mag_data.add_data_block()
diff --git a/ptypy/accelerate/cuda_pycuda/engines/stochastic.py b/ptypy/accelerate/cuda_pycuda/engines/stochastic.py
index 881cb33a2..d45a67218 100644
--- a/ptypy/accelerate/cuda_pycuda/engines/stochastic.py
+++ b/ptypy/accelerate/cuda_pycuda/engines/stochastic.py
@@ -166,7 +166,9 @@ def _setup_kernels(self):
         nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS)
         nma = min(fit, MAX_BLOCKS)
 
-        log(3, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma))
+        log(4, 'Free memory available: {:.2f} GB'.format(float(mem)/(1024**3)))
+        log(4, 'Memory to be allocated per block: {:.2f} GB'.format(float(blk)/(1024**3)))
+        log(4, 'PyCUDA max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(nex, nma))
         # reset memory or create new
         self.ex_data = GpuDataManager(ex_mem, 0, nex, True)
         self.ma_data = GpuDataManager(ma_mem, 0, nma, False)
diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py
index 9767ff370..8f7378715 100644
--- a/ptypy/accelerate/cuda_pycuda/kernels.py
+++ b/ptypy/accelerate/cuda_pycuda/kernels.py
@@ -21,6 +21,7 @@ def choose_fft(fft_type, arr_shape):
         try:
+            import filtered_cufft  # probe the compiled extension inside the try so a missing module takes the fallback below
             from ptypy.accelerate.cuda_pycuda.cufft import FFT_cuda as FFT
         except:
             logger.warning('Unable to import cufft version - using Reikna instead')
             from ptypy.accelerate.cuda_pycuda.fft import FFT
     elif fft_type=='skcuda':
diff --git a/templates/engines/cupy/moonflower_DM_ML_cupy.py b/templates/engines/cupy/moonflower_DM_ML_cupy.py
new file mode 100644
index 000000000..71874b265
--- /dev/null
+++ b/templates/engines/cupy/moonflower_DM_ML_cupy.py
@@ -0,0 +1,68 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cupy")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 400
+
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 600 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 600
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=1)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'DM_cupy'
+p.engines.engine00.numiter = 60
+p.engines.engine00.numiter_contiguous = 10
+p.engines.engine00.probe_support = 0.5
+
+# attach a second reconstruction engine
+p.engines.engine01 = u.Param()
+p.engines.engine01.name = 'ML_cupy'
+p.engines.engine01.numiter = 20
+p.engines.engine01.numiter_contiguous = 5
+p.engines.engine01.reg_del2 = False
+p.engines.engine01.reg_del2_amplitude = 1.
+p.engines.engine01.floating_intensities = False
+p.engines.engine01.probe_support = 0.5
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/cupy/moonflower_DM_cupy.py b/templates/engines/cupy/moonflower_DM_cupy.py
new file mode 100644
index 000000000..db49581ab
--- /dev/null
+++ b/templates/engines/cupy/moonflower_DM_cupy.py
@@ -0,0 +1,57 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cupy")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 200
+
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 1000 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull' 
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 1000
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=4)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'DM_cupy'
+p.engines.engine00.numiter = 20
+p.engines.engine00.numiter_contiguous = 10
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/cupy/moonflower_DM_cupy_nostream.py b/templates/engines/cupy/moonflower_DM_cupy_nostream.py
new file mode 100644
index 000000000..f95e83a23
--- /dev/null
+++ b/templates/engines/cupy/moonflower_DM_cupy_nostream.py
@@ -0,0 +1,58 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cupy")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 200
+
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 1000 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 1000
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=4)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'DM_cupy_nostream'
+p.engines.engine00.numiter = 20
+p.engines.engine00.numiter_contiguous = 10
+p.engines.engine00.probe_update_start = 1
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/cupy/moonflower_EPIE_ML_cupy.py b/templates/engines/cupy/moonflower_EPIE_ML_cupy.py
new file mode 100644
index 000000000..83e2dc06b
--- /dev/null
+++ b/templates/engines/cupy/moonflower_EPIE_ML_cupy.py
@@ -0,0 +1,75 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cupy")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'GradFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+p.scans.MF.illumination=u.Param()
+p.scans.MF.illumination.diversity = None
+
+p.scans.MF.coherence=u.Param()
+p.scans.MF.coherence.num_probe_modes = 1
+p.scans.MF.coherence.num_object_modes = 1
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'EPIE_cupy'
+p.engines.engine00.numiter = 200
+p.engines.engine00.probe_center_tol = None
+p.engines.engine00.compute_log_likelihood = True
+p.engines.engine00.object_norm_is_global = True
+p.engines.engine00.alpha = 1
+p.engines.engine00.beta = 1
+p.engines.engine00.probe_update_start = 2
+
+p.engines.engine01 = u.Param()
+p.engines.engine01.name = 'ML_cupy'
+p.engines.engine01.ML_type = 'Gaussian'
+p.engines.engine01.reg_del2 = True 
+p.engines.engine01.reg_del2_amplitude = 1.
+p.engines.engine01.scale_precond = True
+p.engines.engine01.scale_probe_object = 1.
+p.engines.engine01.numiter = 100
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
\ No newline at end of file
diff --git a/templates/engines/cupy/moonflower_EPIE_cupy.py b/templates/engines/cupy/moonflower_EPIE_cupy.py
new file mode 100644
index 000000000..57ce65bb0
--- /dev/null
+++ b/templates/engines/cupy/moonflower_EPIE_cupy.py
@@ -0,0 +1,59 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cupy")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'GradFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'EPIE_cupy'
+p.engines.engine00.numiter = 200
+p.engines.engine00.probe_center_tol = None
+p.engines.engine00.compute_log_likelihood = True
+p.engines.engine00.object_norm_is_global = True
+p.engines.engine00.alpha = 1
+p.engines.engine00.beta = 1
+p.engines.engine00.probe_update_start = 2
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/cupy/moonflower_ML_ML_cupy.py b/templates/engines/cupy/moonflower_ML_ML_cupy.py
new file mode 100644
index 000000000..39bd2871c
--- /dev/null
+++ b/templates/engines/cupy/moonflower_ML_ML_cupy.py
@@ -0,0 +1,72 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cupy")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 400
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 100 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull' 
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 100
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=1)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'ML_cupy'
+p.engines.engine00.numiter = 300
+p.engines.engine00.numiter_contiguous = 5
+p.engines.engine00.reg_del2 = True                      # Whether to use a Gaussian prior (smoothing) regularizer
+p.engines.engine00.reg_del2_amplitude = 1.             # Amplitude of the Gaussian prior if used
+p.engines.engine00.scale_precond = True
+p.engines.engine00.smooth_gradient = 20.
+p.engines.engine00.smooth_gradient_decay = 1/50.
+p.engines.engine00.floating_intensities = False
+
+p.engines.engine01 = u.Param()
+p.engines.engine01.name = 'ML_cupy'
+p.engines.engine01.numiter = 20
+p.engines.engine01.numiter_contiguous = 5
+p.engines.engine01.reg_del2 = False
+p.engines.engine01.reg_del2_amplitude = 1.
+p.engines.engine01.floating_intensities = False
+p.engines.engine01.probe_support = 0.5
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/cupy/moonflower_ML_cupy.py b/templates/engines/cupy/moonflower_ML_cupy.py
new file mode 100644
index 000000000..af0427cfb
--- /dev/null
+++ b/templates/engines/cupy/moonflower_ML_cupy.py
@@ -0,0 +1,63 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cupy")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 400
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 100 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull' 
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 100
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=1)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'ML_cupy'
+p.engines.engine00.numiter = 300
+p.engines.engine00.numiter_contiguous = 5
+p.engines.engine00.reg_del2 = True                      # Whether to use a Gaussian prior (smoothing) regularizer
+p.engines.engine00.reg_del2_amplitude = 1.             # Amplitude of the Gaussian prior if used
+p.engines.engine00.scale_precond = True
+p.engines.engine00.smooth_gradient = 20.
+p.engines.engine00.smooth_gradient_decay = 1/50.
+p.engines.engine00.floating_intensities = False
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/cupy/moonflower_RAAR_ML_cupy.py b/templates/engines/cupy/moonflower_RAAR_ML_cupy.py
new file mode 100644
index 000000000..82688880d
--- /dev/null
+++ b/templates/engines/cupy/moonflower_RAAR_ML_cupy.py
@@ -0,0 +1,69 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cupy")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 400
+
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 600 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 600
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=1)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'RAAR_cupy'
+p.engines.engine00.numiter = 60
+p.engines.engine00.numiter_contiguous = 10
+p.engines.engine00.probe_support = 0.5
+p.engines.engine00.beta = 0.9
+
+# attach a reconstruction engine
+p.engines.engine01 = u.Param()
+p.engines.engine01.name = 'ML_cupy'
+p.engines.engine01.numiter = 20
+p.engines.engine01.numiter_contiguous = 5
+p.engines.engine01.reg_del2 = False  
+p.engines.engine01.reg_del2_amplitude = 1. 
+p.engines.engine01.floating_intensities = False
+p.engines.engine01.probe_support = 0.5
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/cupy/moonflower_RAAR_cupy.py b/templates/engines/cupy/moonflower_RAAR_cupy.py
new file mode 100644
index 000000000..45c93b98e
--- /dev/null
+++ b/templates/engines/cupy/moonflower_RAAR_cupy.py
@@ -0,0 +1,58 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cupy")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+p.frames_per_block = 200
+
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 1000 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'BlockFull' 
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 1000
+p.scans.MF.data.save = None
+
+p.scans.MF.illumination = u.Param(diversity=None)
+p.scans.MF.coherence = u.Param(num_probe_modes=4)
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'RAAR_cupy'
+p.engines.engine00.numiter = 20
+p.engines.engine00.numiter_contiguous = 10
+p.engines.engine00.beta = 0.9
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/cupy/moonflower_SDR_cupy.py b/templates/engines/cupy/moonflower_SDR_cupy.py
new file mode 100644
index 000000000..505954de9
--- /dev/null
+++ b/templates/engines/cupy/moonflower_SDR_cupy.py
@@ -0,0 +1,61 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses the test Scan class
+`ptypy.core.data.MoonFlowerScan` to provide "data".
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cupy")
+
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+# for verbose output
+p.verbose_level = "info"
+
+# Frames per block
+p.frames_per_block = 200
+
+# set home path
+p.io = u.Param()
+p.io.home =  "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False)
+p.io.interaction = u.Param(active=False)
+
+# max 200 frames (128x128px) of diffraction data
+p.scans = u.Param()
+p.scans.MF = u.Param()
+# now you have to specify which ScanModel to use with scans.XX.name,
+# just as you have to give 'name' for engines and PtyScan subclasses.
+p.scans.MF.name = 'Full'
+p.scans.MF.data= u.Param()
+p.scans.MF.data.name = 'MoonFlowerScan'
+p.scans.MF.data.shape = 128
+p.scans.MF.data.num_frames = 200
+p.scans.MF.data.save = None
+
+# position distance in fraction of illumination frame
+p.scans.MF.data.density = 0.2
+# total number of photon in empty beam
+p.scans.MF.data.photons = 1e8
+# Gaussian FWHM of possible detector blurring
+p.scans.MF.data.psf = 0.0
+p.scans.MF.coherence = u.Param()
+p.scans.MF.coherence.num_probe_modes = 1
+
+# attach a reconstruction engine
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'SDR_cupy'
+p.engines.engine00.numiter = 500
+p.engines.engine00.sigma = 0.5
+p.engines.engine00.tau = 0.1
+p.engines.engine00.probe_update_start = 2
+
+# prepare and run
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/engines/moonflower_DM_ocl.py b/templates/engines/legacy/moonflower_DM_ocl.py
similarity index 100%
rename from templates/engines/moonflower_DM_ocl.py
rename to templates/engines/legacy/moonflower_DM_ocl.py
diff --git a/templates/engines/moonflower_DM.py b/templates/engines/numpy/moonflower_DM.py
similarity index 100%
rename from templates/engines/moonflower_DM.py
rename to templates/engines/numpy/moonflower_DM.py
diff --git a/templates/engines/moonflower_DM_ML.py b/templates/engines/numpy/moonflower_DM_ML.py
similarity index 100%
rename from templates/engines/moonflower_DM_ML.py
rename to templates/engines/numpy/moonflower_DM_ML.py
diff --git a/templates/engines/moonflower_EPIE.py b/templates/engines/numpy/moonflower_EPIE.py
similarity index 100%
rename from templates/engines/moonflower_EPIE.py
rename to templates/engines/numpy/moonflower_EPIE.py
diff --git a/templates/engines/moonflower_ML_Euclid.py b/templates/engines/numpy/moonflower_ML_Euclid.py
similarity index 100%
rename from templates/engines/moonflower_ML_Euclid.py
rename to templates/engines/numpy/moonflower_ML_Euclid.py
diff --git a/templates/engines/moonflower_ML_Gaussian.py b/templates/engines/numpy/moonflower_ML_Gaussian.py
similarity index 100%
rename from templates/engines/moonflower_ML_Gaussian.py
rename to templates/engines/numpy/moonflower_ML_Gaussian.py
diff --git a/templates/engines/moonflower_ML_ML.py b/templates/engines/numpy/moonflower_ML_ML.py
similarity index 100%
rename from templates/engines/moonflower_ML_ML.py
rename to templates/engines/numpy/moonflower_ML_ML.py
diff --git a/templates/engines/moonflower_ML_Poisson.py b/templates/engines/numpy/moonflower_ML_Poisson.py
similarity index 100%
rename from templates/engines/moonflower_ML_Poisson.py
rename to templates/engines/numpy/moonflower_ML_Poisson.py
diff --git a/templates/engines/moonflower_RAAR.py b/templates/engines/numpy/moonflower_RAAR.py
similarity index 100%
rename from templates/engines/moonflower_RAAR.py
rename to templates/engines/numpy/moonflower_RAAR.py
diff --git a/templates/engines/moonflower_RAAR_ML.py b/templates/engines/numpy/moonflower_RAAR_ML.py
similarity index 100%
rename from templates/engines/moonflower_RAAR_ML.py
rename to templates/engines/numpy/moonflower_RAAR_ML.py
diff --git a/templates/engines/moonflower_SDR.py b/templates/engines/numpy/moonflower_SDR.py
similarity index 100%
rename from templates/engines/moonflower_SDR.py
rename to templates/engines/numpy/moonflower_SDR.py
diff --git a/templates/engines/moonflower_DM_ML_pycuda.py b/templates/engines/pycuda/moonflower_DM_ML_pycuda.py
similarity index 100%
rename from templates/engines/moonflower_DM_ML_pycuda.py
rename to templates/engines/pycuda/moonflower_DM_ML_pycuda.py
diff --git a/templates/engines/moonflower_DM_pycuda.py b/templates/engines/pycuda/moonflower_DM_pycuda.py
similarity index 100%
rename from templates/engines/moonflower_DM_pycuda.py
rename to templates/engines/pycuda/moonflower_DM_pycuda.py
diff --git a/templates/engines/moonflower_DM_pycuda_nostream.py b/templates/engines/pycuda/moonflower_DM_pycuda_nostream.py
similarity index 100%
rename from templates/engines/moonflower_DM_pycuda_nostream.py
rename to templates/engines/pycuda/moonflower_DM_pycuda_nostream.py
diff --git a/templates/engines/moonflower_EPIE_ML_pycuda.py b/templates/engines/pycuda/moonflower_EPIE_ML_pycuda.py
similarity index 100%
rename from templates/engines/moonflower_EPIE_ML_pycuda.py
rename to templates/engines/pycuda/moonflower_EPIE_ML_pycuda.py
diff --git a/templates/engines/moonflower_EPIE_pycuda.py b/templates/engines/pycuda/moonflower_EPIE_pycuda.py
similarity index 100%
rename from templates/engines/moonflower_EPIE_pycuda.py
rename to templates/engines/pycuda/moonflower_EPIE_pycuda.py
diff --git a/templates/engines/moonflower_ML_ML_pycuda.py b/templates/engines/pycuda/moonflower_ML_ML_pycuda.py
similarity index 100%
rename from templates/engines/moonflower_ML_ML_pycuda.py
rename to templates/engines/pycuda/moonflower_ML_ML_pycuda.py
diff --git a/templates/engines/moonflower_ML_pycuda.py b/templates/engines/pycuda/moonflower_ML_pycuda.py
similarity index 100%
rename from templates/engines/moonflower_ML_pycuda.py
rename to templates/engines/pycuda/moonflower_ML_pycuda.py
diff --git a/templates/engines/moonflower_RAAR_ML_pycuda.py b/templates/engines/pycuda/moonflower_RAAR_ML_pycuda.py
similarity index 100%
rename from templates/engines/moonflower_RAAR_ML_pycuda.py
rename to templates/engines/pycuda/moonflower_RAAR_ML_pycuda.py
diff --git a/templates/engines/moonflower_RAAR_pycuda.py b/templates/engines/pycuda/moonflower_RAAR_pycuda.py
similarity index 100%
rename from templates/engines/moonflower_RAAR_pycuda.py
rename to templates/engines/pycuda/moonflower_RAAR_pycuda.py
diff --git a/templates/engines/moonflower_SDR_pycuda.py b/templates/engines/pycuda/moonflower_SDR_pycuda.py
similarity index 100%
rename from templates/engines/moonflower_SDR_pycuda.py
rename to templates/engines/pycuda/moonflower_SDR_pycuda.py
diff --git a/templates/engines/moonflower_DM_serial.py b/templates/engines/serial/moonflower_DM_serial.py
similarity index 100%
rename from templates/engines/moonflower_DM_serial.py
rename to templates/engines/serial/moonflower_DM_serial.py
diff --git a/templates/engines/moonflower_EPIE_serial.py b/templates/engines/serial/moonflower_EPIE_serial.py
similarity index 100%
rename from templates/engines/moonflower_EPIE_serial.py
rename to templates/engines/serial/moonflower_EPIE_serial.py
diff --git a/templates/engines/moonflower_ML_serial.py b/templates/engines/serial/moonflower_ML_serial.py
similarity index 100%
rename from templates/engines/moonflower_ML_serial.py
rename to templates/engines/serial/moonflower_ML_serial.py
diff --git a/templates/engines/moonflower_RAAR_serial.py b/templates/engines/serial/moonflower_RAAR_serial.py
similarity index 100%
rename from templates/engines/moonflower_RAAR_serial.py
rename to templates/engines/serial/moonflower_RAAR_serial.py
diff --git a/templates/engines/moonflower_SDR_serial.py b/templates/engines/serial/moonflower_SDR_serial.py
similarity index 100%
rename from templates/engines/moonflower_SDR_serial.py
rename to templates/engines/serial/moonflower_SDR_serial.py
diff --git a/test/accelerate_tests/cuda_cupy_tests/fft_scaling_test.py b/test/accelerate_tests/cuda_cupy_tests/fft_scaling_test.py
index 00d785859..96cd12560 100644
--- a/test/accelerate_tests/cuda_cupy_tests/fft_scaling_test.py
+++ b/test/accelerate_tests/cuda_cupy_tests/fft_scaling_test.py
@@ -39,16 +39,16 @@ def get_reverse_cuFFT(f, stream,
 
 class FftScalingTest(CupyCudaTest):
 
-    def get_input(self):
-        rows = cols = 32
+    def get_input(self, size):
+        rows = cols = size
         batches = 1
         f = np.ones(shape=(batches, rows, cols), dtype=COMPLEX_TYPE)
         return f
 
     #### Trivial foward transform tests ####
 
-    def fwd_test(self, symmetric, factory, preffact=None, postfact=None, external=True):
-        f = self.get_input()
+    def fwd_test(self, symmetric, factory, preffact=None, postfact=None, external=True, size=32):
+        f = self.get_input(size)
         f_d = cp.asarray(f)
         if preffact is not None:
             pref = preffact * np.ones(shape=f.shape[-2:], dtype=np.complex64)
@@ -71,7 +71,7 @@ def fwd_test(self, symmetric, factory, preffact=None, postfact=None, external=Tr
         scale = 1.0 if not symmetric else 1.0 / np.sqrt(elements)
         expected = elements * scale * preffact * postfact
         self.assertAlmostEqual(f_back[0,0,0], expected)
-        np.testing.assert_array_almost_equal(f_back.flat[1:], 0)
+        np.testing.assert_array_almost_equal(f_back.flat[1:], 0, decimal=5)
 
     def test_fwd_noscale_cufft(self):
         self.fwd_test(False, get_forward_cuFFT)
@@ -121,11 +121,34 @@ def test_prepostfilt_fwd_scale_cufft(self):
     def test_prepostfilt_fwd_scale_cufft_cupy(self):
         self.fwd_test(True, get_forward_cuFFT, postfact=2.0, preffact=1.5, external=False)
 
+    def test_fwd_not_power_two_noscale_cufft_cupy(self):
+        self.fwd_test(False, get_forward_cuFFT, external=False, size=20)
 
+    def test_fwd_not_power_two_scale_cufft_cupy(self):
+        self.fwd_test(True, get_forward_cuFFT, external=False, size=20)
+
+    def test_prefilt_fwd_not_power_two_noscale_cufft_cupy(self):
+        self.fwd_test(False, get_forward_cuFFT, preffact=2.0, external=False, size=20)
+
+    def test_prefilt_fwd_not_power_two_scale_cufft_cupy(self):
+        self.fwd_test(True, get_forward_cuFFT, preffact=2.0, external=False, size=20)
+
+    def test_postfilt_fwd_not_power_two_noscale_cufft_cupy(self):
+        self.fwd_test(False, get_forward_cuFFT, postfact=2.0, external=False, size=20)
+
+    def test_postfilt_fwd_not_power_two_scale_cufft_cupy(self):
+        self.fwd_test(True, get_forward_cuFFT, postfact=2.0, external=False, size=20)
+
+    def test_prepostfilt_not_power_two_fwd_noscale_cufft_cupy(self):
+        self.fwd_test(False, get_forward_cuFFT, postfact=2.0, preffact=1.5, external=False, size=20)
+
+    def test_prepostfilt_not_power_two_fwd_scale_cufft_cupy(self):
+        self.fwd_test(True, get_forward_cuFFT, postfact=2.0, preffact=1.5, external=False, size=20)
+        
     ############# Trivial inverse transform tests #########
 
-    def rev_test(self, symmetric, factory, preffact=None, postfact=None, external=True):
-        f = self.get_input()
+    def rev_test(self, symmetric, factory, preffact=None, postfact=None, external=True, size=32):
+        f = self.get_input(size)
         f_d = cp.asarray(f)
         if preffact is not None:
             pref = preffact * np.ones(shape=f.shape[-2:], dtype=np.complex64)
@@ -148,7 +171,7 @@ def rev_test(self, symmetric, factory, preffact=None, postfact=None, external=Tr
         scale = 1.0 if not symmetric else np.sqrt(elements)
         expected = scale * preffact * postfact
         self.assertAlmostEqual(f_back[0,0,0], expected)
-        np.testing.assert_array_almost_equal(f_back.flat[1:], 0)
+        np.testing.assert_array_almost_equal(f_back.flat[1:], 0, decimal=5)
 
 
     def test_rev_noscale_cufft(self):
@@ -199,6 +222,29 @@ def test_prepostfilt_rev_scale_cufft(self):
     def test_prepostfilt_rev_scale_cufft_cupy(self):
         self.rev_test(True, get_reverse_cuFFT, postfact=1.5, preffact=2.0, external=False)
 
+    def test_rev_not_power_two_noscale_cufft_cupy(self):
+        self.rev_test(False, get_reverse_cuFFT, external=False, size=20)
+
+    def test_rev_not_power_two_scale_cufft_cupy(self):
+        self.rev_test(True, get_reverse_cuFFT, external=False, size=20)
+
+    def test_prefilt_rev_not_power_two_noscale_cufft_cupy(self):
+        self.rev_test(False, get_reverse_cuFFT, preffact=1.5, external=False, size=20)
+
+    def test_prefilt_rev_not_power_two_scale_cufft_cupy(self):
+        self.rev_test(True, get_reverse_cuFFT, preffact=1.5, external=False, size=20)
+
+    def test_postfilt_rev_not_power_two_noscale_cufft_cupy(self):
+        self.rev_test(False, get_reverse_cuFFT, postfact=1.5, external=False, size=20)
+
+    def test_postfilt_rev_not_power_two_scale_cufft_cupy(self):
+        self.rev_test(True, get_reverse_cuFFT, postfact=1.5, external=False, size=20)
+
+    def test_prepostfilt_rev_not_power_two_noscale_cufft_cupy(self):
+        self.rev_test(False, get_reverse_cuFFT, postfact=1.5, preffact=2.0, external=False, size=20)
+
+    def test_prepostfilt_rev_not_power_two_scale_cufft_cupy(self):
+        self.rev_test(True, get_reverse_cuFFT, postfact=1.5, preffact=2.0, external=False, size=20)
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_scaling_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_scaling_test.py
index 8449adae0..b16a92902 100644
--- a/test/accelerate_tests/cuda_pycuda_tests/fft_scaling_test.py
+++ b/test/accelerate_tests/cuda_pycuda_tests/fft_scaling_test.py
@@ -92,6 +92,7 @@ def test_fwd_noscale_reikna(self):
     def test_fwd_noscale_cufft(self):
         self.fwd_test(False, get_forward_cuFFT)
     
+    @unittest.skip("Skcuda is currently broken")
     def test_fwd_noscale_cufft_skcuda(self):
         self.fwd_test(False, get_forward_cuFFT, external=False)
 
@@ -101,6 +102,7 @@ def test_fwd_scale_reikna(self):
     def test_fwd_scale_cufft(self):
         self.fwd_test(True, get_forward_cuFFT)
 
+    @unittest.skip("Skcuda is currently broken")
     def test_fwd_scale_cufft_skcuda(self):
         self.fwd_test(True, get_forward_cuFFT, external=False)
 
@@ -110,6 +112,7 @@ def test_prefilt_fwd_noscale_reikna(self):
     def test_prefilt_fwd_noscale_cufft(self):
         self.fwd_test(False, get_forward_cuFFT, preffact=2.0)
 
+    @unittest.skip("Skcuda is currently broken")
     def test_prefilt_fwd_noscale_cufft_skcuda(self):
         self.fwd_test(False, get_forward_cuFFT, preffact=2.0, external=False)
 
@@ -119,6 +122,7 @@ def test_prefilt_fwd_scale_reikna(self):
     def test_prefilt_fwd_scale_cufft(self):
         self.fwd_test(True, get_forward_cuFFT, preffact=2.0)
 
+    @unittest.skip("Skcuda is currently broken")
     def test_prefilt_fwd_scale_cufft_skcuda(self):
         self.fwd_test(True, get_forward_cuFFT, preffact=2.0, external=False)
 
@@ -128,6 +132,7 @@ def test_postfilt_fwd_noscale_reikna(self):
     def test_postfilt_fwd_noscale_cufft(self):
         self.fwd_test(False, get_forward_cuFFT, postfact=2.0)
     
+    @unittest.skip("Skcuda is currently broken")
     def test_postfilt_fwd_noscale_cufft_skcuda(self):
         self.fwd_test(False, get_forward_cuFFT, postfact=2.0, external=False)
 
@@ -137,6 +142,7 @@ def test_postfilt_fwd_scale_reikna(self):
     def test_postfilt_fwd_scale_cufft(self):
         self.fwd_test(True, get_forward_cuFFT, postfact=2.0)
 
+    @unittest.skip("Skcuda is currently broken")
     def test_postfilt_fwd_scale_cufft_skcuda(self):
         self.fwd_test(True, get_forward_cuFFT, postfact=2.0, external=False)
 
@@ -146,6 +152,7 @@ def test_prepostfilt_fwd_noscale_reikna(self):
     def test_prepostfilt_fwd_noscale_cufft(self):
         self.fwd_test(False, get_forward_cuFFT, postfact=2.0, preffact=1.5)
     
+    @unittest.skip("Skcuda is currently broken")
     def test_prepostfilt_fwd_noscale_cufft_skcuda(self):
         self.fwd_test(False, get_forward_cuFFT, postfact=2.0, preffact=1.5, external=False)
 
@@ -155,6 +162,7 @@ def test_prepostfilt_fwd_scale_reikna(self):
     def test_prepostfilt_fwd_scale_cufft(self):
         self.fwd_test(True, get_forward_cuFFT, postfact=2.0, preffact=1.5)
 
+    @unittest.skip("Skcuda is currently broken")
     def test_prepostfilt_fwd_scale_cufft_skcuda(self):
         self.fwd_test(True, get_forward_cuFFT, postfact=2.0, preffact=1.5, external=False)
 
@@ -194,6 +202,7 @@ def test_rev_noscale_reikna(self):
     def test_rev_noscale_cufft(self):
         self.rev_test(False, get_reverse_cuFFT)
 
+    @unittest.skip("Skcuda is currently broken")
     def test_rev_noscale_cufft_skcuda(self):
         self.rev_test(False, get_reverse_cuFFT, external=False)
 
@@ -203,6 +212,7 @@ def test_rev_scale_reikna(self):
     def test_rev_scale_cufft(self):
         self.rev_test(True, get_reverse_cuFFT)
 
+    @unittest.skip("Skcuda is currently broken")
     def test_rev_scale_cufft_skcuda(self):
         self.rev_test(True, get_reverse_cuFFT, external=False)
 
@@ -212,6 +222,7 @@ def test_prefilt_rev_noscale_reikna(self):
     def test_prefilt_rev_noscale_cufft(self):
         self.rev_test(False, get_reverse_cuFFT, preffact=1.5)
 
+    @unittest.skip("Skcuda is currently broken")
     def test_prefilt_rev_noscale_cufft_skcuda(self):
         self.rev_test(False, get_reverse_cuFFT, preffact=1.5, external=False)
 
@@ -221,6 +232,7 @@ def test_prefilt_rev_scale_reikna(self):
     def test_prefilt_rev_scale_cufft(self):
         self.rev_test(True, get_reverse_cuFFT, preffact=1.5)
 
+    @unittest.skip("Skcuda is currently broken")
     def test_prefilt_rev_scale_cufft_skcuda(self):
         self.rev_test(True, get_reverse_cuFFT, preffact=1.5, external=False)
 
@@ -230,6 +242,7 @@ def test_postfilt_rev_noscale_reikna(self):
     def test_postfilt_rev_noscale_cufft(self):
         self.rev_test(False, get_reverse_cuFFT, postfact=1.5)
 
+    @unittest.skip("Skcuda is currently broken")
     def test_postfilt_rev_noscale_cufft_skcuda(self):
         self.rev_test(False, get_reverse_cuFFT, postfact=1.5, external=False)
 
@@ -239,6 +252,7 @@ def test_postfilt_rev_scale_reikna(self):
     def test_postfilt_rev_scale_cufft(self):
         self.rev_test(True, get_reverse_cuFFT, postfact=1.5)
 
+    @unittest.skip("Skcuda is currently broken")
     def test_postfilt_rev_scale_cufft_skcuda(self):
         self.rev_test(True, get_reverse_cuFFT, postfact=1.5, external=False)
 
@@ -248,6 +262,7 @@ def test_prepostfilt_rev_noscale_reikna(self):
     def test_prepostfilt_rev_noscale_cufft(self):
         self.rev_test(False, get_reverse_cuFFT, postfact=1.5, preffact=2.0)
 
+    @unittest.skip("Skcuda is currently broken")
     def test_prepostfilt_rev_noscale_cufft_skcuda(self):
         self.rev_test(False, get_reverse_cuFFT, postfact=1.5, preffact=2.0, external=False)
 
@@ -257,6 +272,7 @@ def test_prepostfilt_rev_scale_reikna(self):
     def test_prepostfilt_rev_scale_cufft(self):
         self.rev_test(True, get_reverse_cuFFT, postfact=1.5, preffact=2.0)
 
+    @unittest.skip("Skcuda is currently broken")
     def test_prepostfilt_rev_scale_cufft_skcuda(self):
         self.rev_test(True, get_reverse_cuFFT, postfact=1.5, preffact=2.0, external=False)
 

From a0a4b7e030860878e7bd2c080b81938eb6f6b3dc Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer" <bjdaurer@gmail.com>
Date: Fri, 2 Feb 2024 16:49:51 +0000
Subject: [PATCH 36/37] Release notes for 0.8 (#532)

* started draft for release notes for 0.8

* updated message about threepie

* updated release notes
---
 release_notes.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/release_notes.md b/release_notes.md
index 8249582a9..bed47e241 100644
--- a/release_notes.md
+++ b/release_notes.md
@@ -1,5 +1,10 @@
 # PtyPy 0.8 release notes
 
+We're excited to bring you a new release, with new engines, CuPy support and
+other improvements.
+
+## GPU acceleration
+
 An alternative CUDA implementation based on [`cupy`](https://cupy.dev/) 
 has been implemented, providing the same feature as the `PyCuda` based
 engine. 
@@ -8,6 +13,24 @@ It can be imported using
 import ptypy
 ptypy.load_gpu_engines('cupy')
 ```
+which will load engines such as ```DM_cupy```, ```RAAR_cupy```, ```ML_cupy```, ```EPIE_cupy``` and ```SDR_cupy```.
+
+
+## Engine updates
+
+* New WASP algorithm including GPU acceleration, available as custom engines by importing the module from ```ptypy.custom``` (thanks to Timothy Poon)
+* Experimental implementation of the ThreePIE algorithm (multislice) which is available as a custom engine by importing the module
+  ```ptypy.custom.threepIE``` and using the engine as ```ThreePIE``` (thanks to Yiran Lu and Maik Kahnt)
+* We provide templates for both algorithms and are working on additional documentation
+
+## Additional build changes
+
+* Added Euclidean noise model to core ML engine (thanks to Jari Fowkes)
+* New saving mode "used_params" that will save parameters used during reconstruction into the output .ptyr file
+* Introducing core functions ```copy_state``` and ```restore_data``` which allow for more efficient parameter sweeps
+
+## Breaking change
+Removed NCCL support from pycuda engines to avoid a dependency on CuPy. The new CuPy engines have been implemented with NCCL support.
 
 
 # PtyPy 0.7.1 release notes

From 1798bd13fc3f895fa350b01d28562261da5d80fb Mon Sep 17 00:00:00 2001
From: Timothy Poon <62692924+ptim0626@users.noreply.github.com>
Date: Fri, 2 Feb 2024 19:12:41 +0000
Subject: [PATCH 37/37] Remove old argument in WASP pycuda because of #520
 (#533)

---
 ptypy/custom/WASP_pycuda.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ptypy/custom/WASP_pycuda.py b/ptypy/custom/WASP_pycuda.py
index f68b48434..e96bce1b5 100644
--- a/ptypy/custom/WASP_pycuda.py
+++ b/ptypy/custom/WASP_pycuda.py
@@ -69,9 +69,9 @@ def engine_initialize(self):
         Prepare for reconstruction.
         """
         # Context, Multi GPU communicator and Stream (needs to be in this order)
-        self.context, self.queue = get_context(new_context=True, new_queue=False)
+        self.context, self.queue = get_context(new_queue=False)
         self.multigpu = get_multi_gpu_communicator()
-        self.context, self.queue = get_context(new_context=False, new_queue=True)
+        self.context, self.queue = get_context(new_queue=True)
 
         # initialise kernels for centring probe if required
         if self.p.probe_center_tol is not None: