From d886cf2feaea2819f6943b601ef63aff4c95ad7f Mon Sep 17 00:00:00 2001 From: Timothy Poon <62692924+ptim0626@users.noreply.github.com> Date: Tue, 24 Jan 2023 09:57:33 +0000 Subject: [PATCH 01/37] Use primary context when setting up pycuda-related tests (#468) * Use primary context when setting up pycuda-related tests * Set context to None in tests tear-down as what has been done in pycuda * Use primary context for multi-gpu mpi tests --- .../cuda_pycuda_tests/__init__.py | 10 ++++++-- .../cuda_pycuda_tests/multi_gpu_test.py | 23 ++++++++++++------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/test/accelerate_tests/cuda_pycuda_tests/__init__.py b/test/accelerate_tests/cuda_pycuda_tests/__init__.py index 04582430f..ab94a4eb6 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/__init__.py +++ b/test/accelerate_tests/cuda_pycuda_tests/__init__.py @@ -26,7 +26,13 @@ class PyCudaTest(unittest.TestCase): def setUp(self): import sys np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf) - self.ctx = make_default_context() + + def _retain_primary_context(dev): + ctx = dev.retain_primary_context() + ctx.push() + return ctx + self.ctx = make_default_context(_retain_primary_context) + self.stream = cuda.Stream() # enable assertions in CUDA kernels for testing if not 'perf' in self._testMethodName: @@ -37,7 +43,7 @@ def setUp(self): def tearDown(self): np.set_printoptions() self.ctx.pop() - self.ctx.detach() + self.ctx = None if not 'perf' in self._testMethodName: cuda_pycuda.debug_options = self.opts_old diff --git a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py index 64cc5110d..fdc34a528 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/multi_gpu_test.py @@ -9,12 +9,13 @@ if have_pycuda(): from pycuda import gpuarray import pycuda.driver as cuda + from pycuda.tools import make_default_context from 
ptypy.accelerate.cuda_pycuda import multi_gpu as mgpu from ptypy.utils import parallel from pkg_resources import parse_version -class GpuDataTest(PyCudaTest): +class GpuDataTest(unittest.TestCase): """ This is a test class for MPI - to really check if it all works, it needs to be run as: @@ -27,20 +28,26 @@ class GpuDataTest(PyCudaTest): needs to be set, mpi4py version 3.1.0+ used, a pycuda build from master, and a cuda-aware MPI version. + + To check if it is a cuda-aware MPI version: + ompi_info --parsable --all | grep mpi_built_with_cuda_support:value """ def setUp(self): if parallel.rank_local < cuda.Device.count(): - self.device = cuda.Device(parallel.rank_local) - self.ctx = self.device.make_context() - self.ctx.push() + def _retain_primary_context(dev): + ctx = dev.retain_primary_context() + ctx.push() + return ctx + self.ctx = make_default_context(_retain_primary_context) + self.device = self.ctx.get_device() else: self.ctx = None def tearDown(self): if self.ctx is not None: self.ctx.pop() - self.ctx.detach() + self.ctx = None @unittest.skipIf(parallel.rank != 0, "Only in MPI rank 0") def test_version(self): @@ -53,7 +60,7 @@ def test_compute_mode(self): attr = cuda.Context.get_device().get_attributes() self.assertIn(cuda.device_attribute.COMPUTE_MODE, attr) mode = attr[cuda.device_attribute.COMPUTE_MODE] - self.assertIn(mode, + self.assertIn(mode, [cuda.compute_mode.DEFAULT, cuda.compute_mode.PROHIBITED, cuda.compute_mode.EXCLUSIVE_PROCESS] ) @@ -71,7 +78,7 @@ def multigpu_tester(self, com): def test_multigpu_auto(self): self.multigpu_tester(mgpu.get_multi_gpu_communicator()) - + def test_multigpu_mpi(self): self.multigpu_tester(mgpu.MultiGpuCommunicatorMpi()) @@ -81,4 +88,4 @@ def test_multigpu_cudampi(self): @unittest.skipIf(not mgpu.have_nccl, "NCCL not available") def test_multigpu_nccl(self): - self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl()) \ No newline at end of file + self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl()) From 
44e505515f7cb979a697df029881fa2644ec48eb Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Tue, 24 Jan 2023 13:24:37 +0000 Subject: [PATCH 02/37] CuPy backend (#469) * Cupy backend + most array utils kernels passing / tested * Import pybind11 later, to allow setup to run without and install the dependency * Adds all elementary kernels + tests to cupy backend * Adds complete cupy-based engines * make sure headers (.cuh) are copied to build dir * Finalising changes for the cupy engines * Adds cupy template for minimal prep and run * remove text from release notes Co-authored-by: Team GPU --- .gitignore | 1 + cufft/extensions.py | 2 +- cufft/setup.py | 1 + ptypy/__init__.py | 7 +- .../cuda => cuda_common}/__init__.py | 0 .../cuda => cuda_common}/abs2sum.cu | 3 +- .../cuda => cuda_common}/batched_multiply.cu | 3 +- .../cuda => cuda_common}/build_aux.cu | 0 .../cuda => cuda_common}/build_aux_no_ex.cu | 3 +- .../build_aux_position_correction.cu | 3 +- .../cuda => cuda_common}/build_exit.cu | 0 .../build_exit_alpha_tau.cu | 0 .../cuda => cuda_common}/clip_magnitudes.cu | 5 +- ptypy/accelerate/cuda_common/common.cuh | 12 + .../cuda => cuda_common}/convolution.cu | 3 +- .../{cuda_pycuda/cuda => cuda_common}/delx.cu | 3 +- .../{cuda_pycuda/cuda => cuda_common}/dot.cu | 4 +- .../cuda => cuda_common}/error_reduce.cu | 0 .../cuda => cuda_common}/exit_error.cu | 7 +- .../cuda => cuda_common}/fill3D.cu | 4 +- .../cuda => cuda_common}/fill_b.cu | 0 .../cuda => cuda_common}/fmag_all_update.cu | 5 +- .../fmag_update_nopbound.cu | 5 +- .../cuda => cuda_common}/fourier_deviation.cu | 7 +- .../cuda => cuda_common}/fourier_error.cu | 7 +- .../cuda => cuda_common}/fourier_error2.cu | 5 +- .../cuda => cuda_common}/fourier_update.cu | 5 +- .../cuda => cuda_common}/full_reduce.cu | 2 - .../cuda => cuda_common}/gd_main.cu | 3 +- .../cuda => cuda_common}/get_address.cu | 4 +- .../cuda => cuda_common}/intens_renorm.cu | 3 +- .../interpolated_shift.cu | 8 +- .../cuda => 
cuda_common}/log_likelihood.cu | 7 +- .../cuda => cuda_common}/make_a012.cu | 3 +- .../cuda => cuda_common}/make_aux.cu | 3 +- .../cuda => cuda_common}/make_exit.cu | 3 +- .../cuda => cuda_common}/make_model.cu | 3 +- .../cuda => cuda_common}/mass_center.cu | 0 .../cuda => cuda_common}/max_abs2.cu | 5 +- .../cuda => cuda_common}/ob_norm_local.cu | 7 +- .../cuda => cuda_common}/ob_update.cu | 3 +- .../cuda => cuda_common}/ob_update2.cu | 4 +- .../cuda => cuda_common}/ob_update2_ML.cu | 4 +- .../cuda => cuda_common}/ob_update_ML.cu | 3 +- .../cuda => cuda_common}/ob_update_local.cu | 3 +- .../cuda => cuda_common}/pr_norm_local.cu | 7 +- .../cuda => cuda_common}/pr_update.cu | 3 +- .../cuda => cuda_common}/pr_update2.cu | 4 +- .../cuda => cuda_common}/pr_update2_ML.cu | 4 +- .../cuda => cuda_common}/pr_update_ML.cu | 3 +- .../cuda => cuda_common}/pr_update_local.cu | 3 +- .../cuda => cuda_common}/transpose.cu | 3 +- .../update_addr_error_state.cu | 4 +- ptypy/accelerate/cuda_common/utils.py | 18 + ptypy/accelerate/cuda_cupy/__init__.py | 73 + .../accelerate/cuda_cupy/address_manglers.py | 81 + ptypy/accelerate/cuda_cupy/array_utils.py | 670 ++++++++ ptypy/accelerate/cuda_cupy/cufft.py | 171 +++ ptypy/accelerate/cuda_cupy/dependencies.yml | 17 + ptypy/accelerate/cuda_cupy/engines/ML_cupy.py | 804 ++++++++++ .../accelerate/cuda_cupy/engines/__init__.py | 0 .../cuda_cupy/engines/projectional_cupy.py | 636 ++++++++ .../engines/projectional_cupy_stream.py | 556 +++++++ .../cuda_cupy/engines/stochastic.py | 550 +++++++ ptypy/accelerate/cuda_cupy/kernels.py | 1345 +++++++++++++++++ ptypy/accelerate/cuda_cupy/mem_utils.py | 319 ++++ ptypy/accelerate/cuda_cupy/multi_gpu.py | 151 ++ ptypy/accelerate/cuda_cupy/porting_notes.md | 60 + ptypy/accelerate/cuda_pycuda/__init__.py | 9 +- ptypy/accelerate/cuda_pycuda/array_utils.py | 19 +- pyproject.toml | 2 +- .../ptypy_minimal_prep_and_run_cupy.py | 54 + .../cuda_cupy_tests/__init__.py | 33 + 
.../cuda_cupy_tests/address_manglers_test.py | 77 + .../cuda_cupy_tests/array_utils_test.py | 536 +++++++ .../auxiliary_wave_kernel_test.py | 666 ++++++++ .../derivatives_kernel_test.py | 330 ++++ .../cuda_cupy_tests/engine_tests.py | 172 +++ .../cuda_cupy_tests/engine_utils_test.py | 52 + .../cuda_cupy_tests/fft_scaling_test.py | 204 +++ .../cuda_cupy_tests/fft_setstream_test.py | 97 ++ .../fourier_update_kernel_test.py | 685 +++++++++ .../cuda_cupy_tests/gpudata_test.py | 265 ++++ .../gradient_descent_kernel_test.py | 327 ++++ .../cuda_cupy_tests/import_test.py | 10 + .../cuda_cupy_tests/multi_gpu_test.py | 74 + .../cuda_cupy_tests/po_update_kernel_test.py | 943 ++++++++++++ .../position_correction_kernel_test.py | 149 ++ .../propagation_kernel_test.py | 157 ++ .../cuda_pycuda_tests/fft_setstream_test.py | 1 + 90 files changed, 10351 insertions(+), 156 deletions(-) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/__init__.py (100%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/abs2sum.cu (94%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/batched_multiply.cu (95%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/build_aux.cu (100%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/build_aux_no_ex.cu (98%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/build_aux_position_correction.cu (96%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/build_exit.cu (100%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/build_exit_alpha_tau.cu (100%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/clip_magnitudes.cu (86%) create mode 100644 ptypy/accelerate/cuda_common/common.cuh rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/convolution.cu (99%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/delx.cu (99%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/dot.cu (93%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/error_reduce.cu (100%) rename 
ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/exit_error.cu (91%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/fill3D.cu (95%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/fill_b.cu (100%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/fmag_all_update.cu (95%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/fmag_update_nopbound.cu (95%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/fourier_deviation.cu (93%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/fourier_error.cu (93%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/fourier_error2.cu (96%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/fourier_update.cu (97%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/full_reduce.cu (97%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/gd_main.cu (95%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/get_address.cu (93%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/intens_renorm.cu (97%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/interpolated_shift.cu (98%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/log_likelihood.cu (97%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/make_a012.cu (97%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/make_aux.cu (98%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/make_exit.cu (97%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/make_model.cu (93%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/mass_center.cu (100%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/max_abs2.cu (96%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/ob_norm_local.cu (93%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/ob_update.cu (97%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/ob_update2.cu (98%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/ob_update2_ML.cu (98%) rename 
ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/ob_update_ML.cu (97%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/ob_update_local.cu (97%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/pr_norm_local.cu (93%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/pr_update.cu (97%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/pr_update2.cu (98%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/pr_update2_ML.cu (98%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/pr_update_ML.cu (97%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/pr_update_local.cu (97%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/transpose.cu (96%) rename ptypy/accelerate/{cuda_pycuda/cuda => cuda_common}/update_addr_error_state.cu (94%) create mode 100644 ptypy/accelerate/cuda_common/utils.py create mode 100644 ptypy/accelerate/cuda_cupy/__init__.py create mode 100644 ptypy/accelerate/cuda_cupy/address_manglers.py create mode 100644 ptypy/accelerate/cuda_cupy/array_utils.py create mode 100644 ptypy/accelerate/cuda_cupy/cufft.py create mode 100644 ptypy/accelerate/cuda_cupy/dependencies.yml create mode 100644 ptypy/accelerate/cuda_cupy/engines/ML_cupy.py create mode 100644 ptypy/accelerate/cuda_cupy/engines/__init__.py create mode 100644 ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py create mode 100644 ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py create mode 100644 ptypy/accelerate/cuda_cupy/engines/stochastic.py create mode 100644 ptypy/accelerate/cuda_cupy/kernels.py create mode 100644 ptypy/accelerate/cuda_cupy/mem_utils.py create mode 100644 ptypy/accelerate/cuda_cupy/multi_gpu.py create mode 100644 ptypy/accelerate/cuda_cupy/porting_notes.md create mode 100644 templates/accelerate/ptypy_minimal_prep_and_run_cupy.py create mode 100644 test/accelerate_tests/cuda_cupy_tests/__init__.py create mode 100644 test/accelerate_tests/cuda_cupy_tests/address_manglers_test.py create mode 
100644 test/accelerate_tests/cuda_cupy_tests/array_utils_test.py create mode 100644 test/accelerate_tests/cuda_cupy_tests/auxiliary_wave_kernel_test.py create mode 100644 test/accelerate_tests/cuda_cupy_tests/derivatives_kernel_test.py create mode 100644 test/accelerate_tests/cuda_cupy_tests/engine_tests.py create mode 100644 test/accelerate_tests/cuda_cupy_tests/engine_utils_test.py create mode 100644 test/accelerate_tests/cuda_cupy_tests/fft_scaling_test.py create mode 100644 test/accelerate_tests/cuda_cupy_tests/fft_setstream_test.py create mode 100644 test/accelerate_tests/cuda_cupy_tests/fourier_update_kernel_test.py create mode 100644 test/accelerate_tests/cuda_cupy_tests/gpudata_test.py create mode 100644 test/accelerate_tests/cuda_cupy_tests/gradient_descent_kernel_test.py create mode 100644 test/accelerate_tests/cuda_cupy_tests/import_test.py create mode 100644 test/accelerate_tests/cuda_cupy_tests/multi_gpu_test.py create mode 100644 test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py create mode 100644 test/accelerate_tests/cuda_cupy_tests/position_correction_kernel_test.py create mode 100644 test/accelerate_tests/cuda_cupy_tests/propagation_kernel_test.py diff --git a/.gitignore b/.gitignore index 70e473b91..3bebd583d 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ ghostdriver* .DS_Store .ipynb_checkpoints .clang-format +pip-wheel-metadata/ diff --git a/cufft/extensions.py b/cufft/extensions.py index 4fabf2d2c..545b43d04 100644 --- a/cufft/extensions.py +++ b/cufft/extensions.py @@ -4,7 +4,6 @@ import os, re import subprocess import sysconfig -import pybind11 from distutils.unixccompiler import UnixCCompiler from distutils.command.build_ext import build_ext @@ -98,6 +97,7 @@ def __init__(self, *args, **kwargs): self.LD_FLAGS = [archflag, "-lcufft_static", "-lculibos", "-ldl", "-lrt", "-lpthread", "-cudart shared"] self.NVCC_FLAGS = ["-dc", archflag] self.CXXFLAGS = ['"-fPIC"'] + import pybind11 pybind_includes = 
[pybind11.get_include(), sysconfig.get_path('include')] INCLUDES = pybind_includes + [self.CUDA['lib64'], module_dir] self.INCLUDES = ["-I%s" % ix for ix in INCLUDES] diff --git a/cufft/setup.py b/cufft/setup.py index 8cba2f560..5108ebf32 100644 --- a/cufft/setup.py +++ b/cufft/setup.py @@ -39,6 +39,7 @@ description='Extension of CuFFT to include pre- and post-filters using callbacks', packages=package_list, ext_modules=ext_modules, + install_requires=["pybind11"], cmdclass=cmdclass ) diff --git a/ptypy/__init__.py b/ptypy/__init__.py index 0ca662fb7..5b34c35fa 100644 --- a/ptypy/__init__.py +++ b/ptypy/__init__.py @@ -78,11 +78,16 @@ # Convenience loader for GPU engines def load_gpu_engines(arch='cuda'): - if arch=='cuda': + if arch in ['cuda', 'pycuda']: from .accelerate.cuda_pycuda.engines import projectional_pycuda from .accelerate.cuda_pycuda.engines import projectional_pycuda_stream from .accelerate.cuda_pycuda.engines import stochastic from .accelerate.cuda_pycuda.engines import ML_pycuda + if arch=='cupy': + from .accelerate.cuda_cupy.engines import projectional_cupy + from .accelerate.cuda_cupy.engines import projectional_cupy_stream + from .accelerate.cuda_cupy.engines import stochastic + from .accelerate.cuda_cupy.engines import ML_cupy if arch=='serial': from .accelerate.base.engines import projectional_serial from .accelerate.base.engines import projectional_serial_stream diff --git a/ptypy/accelerate/cuda_pycuda/cuda/__init__.py b/ptypy/accelerate/cuda_common/__init__.py similarity index 100% rename from ptypy/accelerate/cuda_pycuda/cuda/__init__.py rename to ptypy/accelerate/cuda_common/__init__.py diff --git a/ptypy/accelerate/cuda_pycuda/cuda/abs2sum.cu b/ptypy/accelerate/cuda_common/abs2sum.cu similarity index 94% rename from ptypy/accelerate/cuda_pycuda/cuda/abs2sum.cu rename to ptypy/accelerate/cuda_common/abs2sum.cu index 475a228bb..9783c20cc 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/abs2sum.cu +++ b/ptypy/accelerate/cuda_common/abs2sum.cu 
@@ -5,8 +5,7 @@ * - OUT_TYPE: can be float/double */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void abs2sum(const IN_TYPE* a, const int n, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu b/ptypy/accelerate/cuda_common/batched_multiply.cu similarity index 95% rename from ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu rename to ptypy/accelerate/cuda_common/batched_multiply.cu index 1263841b6..f91bb6d38 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/batched_multiply.cu +++ b/ptypy/accelerate/cuda_common/batched_multiply.cu @@ -8,8 +8,7 @@ * - MATH_TYPE: the data type used for computation (filter) */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void batched_multiply(const complex* input, complex* output, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu b/ptypy/accelerate/cuda_common/build_aux.cu similarity index 100% rename from ptypy/accelerate/cuda_pycuda/cuda/build_aux.cu rename to ptypy/accelerate/cuda_common/build_aux.cu diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu b/ptypy/accelerate/cuda_common/build_aux_no_ex.cu similarity index 98% rename from ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu rename to ptypy/accelerate/cuda_common/build_aux_no_ex.cu index ee091c58e..d02500a1a 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_no_ex.cu +++ b/ptypy/accelerate/cuda_common/build_aux_no_ex.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void build_aux_no_ex(complex* auxilliary_wave, int aRows, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu b/ptypy/accelerate/cuda_common/build_aux_position_correction.cu similarity index 96% rename from ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu rename to ptypy/accelerate/cuda_common/build_aux_position_correction.cu index 327040371..9d0f44fad 
100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/build_aux_position_correction.cu +++ b/ptypy/accelerate/cuda_common/build_aux_position_correction.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void build_aux_position_correction( complex* auxiliary_wave, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu b/ptypy/accelerate/cuda_common/build_exit.cu similarity index 100% rename from ptypy/accelerate/cuda_pycuda/cuda/build_exit.cu rename to ptypy/accelerate/cuda_common/build_exit.cu diff --git a/ptypy/accelerate/cuda_pycuda/cuda/build_exit_alpha_tau.cu b/ptypy/accelerate/cuda_common/build_exit_alpha_tau.cu similarity index 100% rename from ptypy/accelerate/cuda_pycuda/cuda/build_exit_alpha_tau.cu rename to ptypy/accelerate/cuda_common/build_exit_alpha_tau.cu diff --git a/ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu b/ptypy/accelerate/cuda_common/clip_magnitudes.cu similarity index 86% rename from ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu rename to ptypy/accelerate/cuda_common/clip_magnitudes.cu index 8128091f9..5db29dbe9 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/clip_magnitudes.cu +++ b/ptypy/accelerate/cuda_common/clip_magnitudes.cu @@ -1,10 +1,7 @@ /** clip_magnitudes. 
* */ - #include - #include - #include - using thrust::complex; + #include "common.cuh" extern "C" __global__ void clip_magnitudes(IN_TYPE *arr, float clip_min, diff --git a/ptypy/accelerate/cuda_common/common.cuh b/ptypy/accelerate/cuda_common/common.cuh new file mode 100644 index 000000000..d2c022373 --- /dev/null +++ b/ptypy/accelerate/cuda_common/common.cuh @@ -0,0 +1,12 @@ +#pragma once + +#ifndef PTYPY_CUPY_NVTRC +// pycuda code +# include +using thrust::complex; + +#else +// cupy code +# include + +#endif \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu b/ptypy/accelerate/cuda_common/convolution.cu similarity index 99% rename from ptypy/accelerate/cuda_pycuda/cuda/convolution.cu rename to ptypy/accelerate/cuda_common/convolution.cu index ae42ecba5..d729fd067 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/convolution.cu +++ b/ptypy/accelerate/cuda_common/convolution.cu @@ -6,8 +6,7 @@ * A symmetric convolution kernel is assumed here */ -#include -using thrust::complex; +#include "common.cuh" /** Implements reflect-mode index wrapping * diff --git a/ptypy/accelerate/cuda_pycuda/cuda/delx.cu b/ptypy/accelerate/cuda_common/delx.cu similarity index 99% rename from ptypy/accelerate/cuda_pycuda/cuda/delx.cu rename to ptypy/accelerate/cuda_common/delx.cu index f2e8a934e..23ce09f05 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/delx.cu +++ b/ptypy/accelerate/cuda_common/delx.cu @@ -5,8 +5,7 @@ * - OUT_TYPE: the data type for the outputs */ -#include -using thrust::complex; +#include "common.cuh" /** Finite difference for forward/backward for any axis that is not the diff --git a/ptypy/accelerate/cuda_pycuda/cuda/dot.cu b/ptypy/accelerate/cuda_common/dot.cu similarity index 93% rename from ptypy/accelerate/cuda_pycuda/cuda/dot.cu rename to ptypy/accelerate/cuda_common/dot.cu index 21087abe3..3dfd909cf 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/dot.cu +++ b/ptypy/accelerate/cuda_common/dot.cu @@ -1,6 +1,4 @@ -#include 
-#include -using thrust::complex; +#include "common.cuh" template __device__ inline T dotmul(const T& a, const T& b) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu b/ptypy/accelerate/cuda_common/error_reduce.cu similarity index 100% rename from ptypy/accelerate/cuda_pycuda/cuda/error_reduce.cu rename to ptypy/accelerate/cuda_common/error_reduce.cu diff --git a/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu b/ptypy/accelerate/cuda_common/exit_error.cu similarity index 91% rename from ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu rename to ptypy/accelerate/cuda_common/exit_error.cu index fdac52e46..2dded7e0f 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/exit_error.cu +++ b/ptypy/accelerate/cuda_common/exit_error.cu @@ -1,9 +1,4 @@ -#include -#include -#include -using std::sqrt; -using thrust::abs; -using thrust::complex; +#include "common.cuh" // specify max number of threads/block and min number of blocks per SM, // to assist the compiler in register optimisations. diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu b/ptypy/accelerate/cuda_common/fill3D.cu similarity index 95% rename from ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu rename to ptypy/accelerate/cuda_common/fill3D.cu index c3f03d8ca..ddaf6b2bc 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fill3D.cu +++ b/ptypy/accelerate/cuda_common/fill3D.cu @@ -5,9 +5,7 @@ * - OUT_TYPE: data type for outputs */ -#include -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void fill3D( OUT_TYPE* A, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu b/ptypy/accelerate/cuda_common/fill_b.cu similarity index 100% rename from ptypy/accelerate/cuda_pycuda/cuda/fill_b.cu rename to ptypy/accelerate/cuda_common/fill_b.cu diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu b/ptypy/accelerate/cuda_common/fmag_all_update.cu similarity index 95% rename from ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu rename to 
ptypy/accelerate/cuda_common/fmag_all_update.cu index f8f695ca5..42d217a67 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fmag_all_update.cu +++ b/ptypy/accelerate/cuda_common/fmag_all_update.cu @@ -6,10 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -#include -using std::sqrt; -using thrust::complex; +#include "common.cuh" extern "C" __global__ void fmag_all_update(complex* f, const IN_TYPE* fmask, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu b/ptypy/accelerate/cuda_common/fmag_update_nopbound.cu similarity index 95% rename from ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu rename to ptypy/accelerate/cuda_common/fmag_update_nopbound.cu index 40a65c172..89e65450b 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fmag_update_nopbound.cu +++ b/ptypy/accelerate/cuda_common/fmag_update_nopbound.cu @@ -6,10 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -#include -using std::sqrt; -using thrust::complex; +#include "common.cuh" extern "C" __global__ void fmag_update_nopbound(complex* f, const IN_TYPE* fmask, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu b/ptypy/accelerate/cuda_common/fourier_deviation.cu similarity index 93% rename from ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu rename to ptypy/accelerate/cuda_common/fourier_deviation.cu index 3427222c3..1548094e9 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_deviation.cu +++ b/ptypy/accelerate/cuda_common/fourier_deviation.cu @@ -6,12 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -#include -#include -using std::sqrt; -using thrust::abs; -using thrust::complex; +#include "common.cuh" // specify max number of threads/block and min number of blocks per SM, // to assist the compiler in register optimisations. 
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu b/ptypy/accelerate/cuda_common/fourier_error.cu similarity index 93% rename from ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu rename to ptypy/accelerate/cuda_common/fourier_error.cu index ad483c870..43b4e5208 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error.cu +++ b/ptypy/accelerate/cuda_common/fourier_error.cu @@ -7,12 +7,7 @@ */ -#include -#include -#include -using std::sqrt; -using thrust::abs; -using thrust::complex; +#include "common.cuh" // specify max number of threads/block and min number of blocks per SM, // to assist the compiler in register optimisations. diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error2.cu b/ptypy/accelerate/cuda_common/fourier_error2.cu similarity index 96% rename from ptypy/accelerate/cuda_pycuda/cuda/fourier_error2.cu rename to ptypy/accelerate/cuda_common/fourier_error2.cu index 86dddf549..36a80c377 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_error2.cu +++ b/ptypy/accelerate/cuda_common/fourier_error2.cu @@ -2,10 +2,7 @@ * the modes. It turned out to run about 2x slower than the one without * shared memory, so it's not used at this stage. */ -#include -#include -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void fourier_error2(int nmodes, complex *f, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/fourier_update.cu b/ptypy/accelerate/cuda_common/fourier_update.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/fourier_update.cu rename to ptypy/accelerate/cuda_common/fourier_update.cu index a713c4418..5be874424 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/fourier_update.cu +++ b/ptypy/accelerate/cuda_common/fourier_update.cu @@ -6,10 +6,7 @@ is 2x slower than individual as we have many idle threads here. It is not used at the moment. 
*/ -#include -#include -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void fourier_update(int nmodes, complex *f_d, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu b/ptypy/accelerate/cuda_common/full_reduce.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu rename to ptypy/accelerate/cuda_common/full_reduce.cu index 801204aaa..7f53a4b2e 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/full_reduce.cu +++ b/ptypy/accelerate/cuda_common/full_reduce.cu @@ -7,8 +7,6 @@ */ -#include - extern "C" __global__ void full_reduce(const IN_TYPE* in, OUT_TYPE* out, int size) { assert(gridDim.x == 1); diff --git a/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu b/ptypy/accelerate/cuda_common/gd_main.cu similarity index 95% rename from ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu rename to ptypy/accelerate/cuda_common/gd_main.cu index 1ab643c4c..461e103ae 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/gd_main.cu +++ b/ptypy/accelerate/cuda_common/gd_main.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void gd_main(const IN_TYPE* Imodel, const IN_TYPE* I, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/get_address.cu b/ptypy/accelerate/cuda_common/get_address.cu similarity index 93% rename from ptypy/accelerate/cuda_pycuda/cuda/get_address.cu rename to ptypy/accelerate/cuda_common/get_address.cu index dda9b45f1..4c42d295b 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/get_address.cu +++ b/ptypy/accelerate/cuda_common/get_address.cu @@ -1,6 +1,4 @@ -#include -#include -using thrust::complex; +#include "common.cuh" inline __device__ int minimum(int a, int b) { return a < b ? 
a : b; } diff --git a/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu b/ptypy/accelerate/cuda_common/intens_renorm.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu rename to ptypy/accelerate/cuda_common/intens_renorm.cu index d0033f7f4..4acd11cf1 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/intens_renorm.cu +++ b/ptypy/accelerate/cuda_common/intens_renorm.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void step1(const IN_TYPE* Imodel, const IN_TYPE* I, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/interpolated_shift.cu b/ptypy/accelerate/cuda_common/interpolated_shift.cu similarity index 98% rename from ptypy/accelerate/cuda_pycuda/cuda/interpolated_shift.cu rename to ptypy/accelerate/cuda_common/interpolated_shift.cu index 49db445f7..23acce6f9 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/interpolated_shift.cu +++ b/ptypy/accelerate/cuda_common/interpolated_shift.cu @@ -1,10 +1,4 @@ -#include -#include -#include -#include -#include -#include -using thrust::complex; +#include "common.cuh" __device__ inline complex& ascomplex(float2& f2) { diff --git a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu b/ptypy/accelerate/cuda_common/log_likelihood.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu rename to ptypy/accelerate/cuda_common/log_likelihood.cu index 075d59f0a..c488f8b69 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/log_likelihood.cu +++ b/ptypy/accelerate/cuda_common/log_likelihood.cu @@ -6,12 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -#include -#include -using std::sqrt; -using thrust::abs; -using thrust::complex; +#include "common.cuh" // specify max number of threads/block and min number of blocks per SM, // to assist the compiler in register optimisations. 
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu b/ptypy/accelerate/cuda_common/make_a012.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu rename to ptypy/accelerate/cuda_common/make_a012.cu index 11ba29f62..760b28913 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/make_a012.cu +++ b/ptypy/accelerate/cuda_common/make_a012.cu @@ -7,8 +7,7 @@ * - ACC_TYPE: data type used for accumulation */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void make_a012(const complex* f, const complex* a, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_aux.cu b/ptypy/accelerate/cuda_common/make_aux.cu similarity index 98% rename from ptypy/accelerate/cuda_pycuda/cuda/make_aux.cu rename to ptypy/accelerate/cuda_common/make_aux.cu index b2f64ba1d..fde2f7812 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/make_aux.cu +++ b/ptypy/accelerate/cuda_common/make_aux.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" // core calculation function - used by both kernels and inlined inline __device__ complex calculate( diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_exit.cu b/ptypy/accelerate/cuda_common/make_exit.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/make_exit.cu rename to ptypy/accelerate/cuda_common/make_exit.cu index 956b292dc..e8613da10 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/make_exit.cu +++ b/ptypy/accelerate/cuda_common/make_exit.cu @@ -7,8 +7,7 @@ */ -#include -using thrust::complex; +#include "common.cuh" template __device__ inline void atomicAdd(complex* x, complex y) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu b/ptypy/accelerate/cuda_common/make_model.cu similarity index 93% rename from ptypy/accelerate/cuda_pycuda/cuda/make_model.cu rename to ptypy/accelerate/cuda_common/make_model.cu index 22bf7d4ab..727388d65 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/make_model.cu 
+++ b/ptypy/accelerate/cuda_common/make_model.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void make_model( const complex* in, OUT_TYPE* out, int z, int y, int x) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/mass_center.cu b/ptypy/accelerate/cuda_common/mass_center.cu similarity index 100% rename from ptypy/accelerate/cuda_pycuda/cuda/mass_center.cu rename to ptypy/accelerate/cuda_common/mass_center.cu diff --git a/ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu b/ptypy/accelerate/cuda_common/max_abs2.cu similarity index 96% rename from ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu rename to ptypy/accelerate/cuda_common/max_abs2.cu index 4da8efb3e..1780bc268 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/max_abs2.cu +++ b/ptypy/accelerate/cuda_common/max_abs2.cu @@ -5,10 +5,7 @@ * - IN_TYPE: can be float/double or complex/complex */ -#include -#include -using thrust::complex; -using thrust::norm; +#include "common.cuh" inline __device__ OUT_TYPE norm(const float& in) { return in*in; diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_norm_local.cu b/ptypy/accelerate/cuda_common/ob_norm_local.cu similarity index 93% rename from ptypy/accelerate/cuda_pycuda/cuda/ob_norm_local.cu rename to ptypy/accelerate/cuda_common/ob_norm_local.cu index 3969ea6e9..9d14cae6d 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_norm_local.cu +++ b/ptypy/accelerate/cuda_common/ob_norm_local.cu @@ -6,12 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -#include -#include -using std::sqrt; -using thrust::abs; -using thrust::complex; +#include "common.cuh" // specify max number of threads/block and min number of blocks per SM, // to assist the compiler in register optimisations. 
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu b/ptypy/accelerate/cuda_common/ob_update.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu rename to ptypy/accelerate/cuda_common/ob_update.cu index 29b993fb0..7bf8dddd9 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update.cu +++ b/ptypy/accelerate/cuda_common/ob_update.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" template __device__ inline void atomicAdd(complex* x, const complex& y) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu b/ptypy/accelerate/cuda_common/ob_update2.cu similarity index 98% rename from ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu rename to ptypy/accelerate/cuda_common/ob_update2.cu index 821c04a6d..1e9717b81 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2.cu +++ b/ptypy/accelerate/cuda_common/ob_update2.cu @@ -15,9 +15,7 @@ */ -#include -#include -using thrust::complex; +#include "common.cuh" #define pr_dlayer(k) addr[(k)] #define ex_dlayer(k) addr[6 * num_pods + (k)] diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu b/ptypy/accelerate/cuda_common/ob_update2_ML.cu similarity index 98% rename from ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu rename to ptypy/accelerate/cuda_common/ob_update2_ML.cu index b62e66006..8840457c0 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update2_ML.cu +++ b/ptypy/accelerate/cuda_common/ob_update2_ML.cu @@ -15,9 +15,7 @@ */ -#include -#include -using thrust::complex; +#include "common.cuh" #define pr_dlayer(k) addr[(k)] #define ex_dlayer(k) addr[6 * num_pods + (k)] diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu b/ptypy/accelerate/cuda_common/ob_update_ML.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu rename to ptypy/accelerate/cuda_common/ob_update_ML.cu index 84e678ebb..3a20024f9 100644 --- 
a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_ML.cu +++ b/ptypy/accelerate/cuda_common/ob_update_ML.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" template __device__ inline void atomicAdd(complex* x, const complex& y) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu b/ptypy/accelerate/cuda_common/ob_update_local.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu rename to ptypy/accelerate/cuda_common/ob_update_local.cu index b3a955868..9ff5b73e6 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/ob_update_local.cu +++ b/ptypy/accelerate/cuda_common/ob_update_local.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" template __device__ inline void atomicAdd(complex* x, const complex& y) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_norm_local.cu b/ptypy/accelerate/cuda_common/pr_norm_local.cu similarity index 93% rename from ptypy/accelerate/cuda_pycuda/cuda/pr_norm_local.cu rename to ptypy/accelerate/cuda_common/pr_norm_local.cu index 6e9a8ea76..a89e8f842 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_norm_local.cu +++ b/ptypy/accelerate/cuda_common/pr_norm_local.cu @@ -6,12 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -#include -#include -using std::sqrt; -using thrust::abs; -using thrust::complex; +#include "common.cuh" // specify max number of threads/block and min number of blocks per SM, // to assist the compiler in register optimisations. 
diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu b/ptypy/accelerate/cuda_common/pr_update.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu rename to ptypy/accelerate/cuda_common/pr_update.cu index 180cf8f14..d7739a569 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update.cu +++ b/ptypy/accelerate/cuda_common/pr_update.cu @@ -6,8 +6,7 @@ * - MATH_TYPE: the data type used for computation */ -#include -using thrust::complex; +#include "common.cuh" template __device__ inline void atomicAdd(complex* x, const complex& y) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu b/ptypy/accelerate/cuda_common/pr_update2.cu similarity index 98% rename from ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu rename to ptypy/accelerate/cuda_common/pr_update2.cu index e5417cc01..09913bc02 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2.cu +++ b/ptypy/accelerate/cuda_common/pr_update2.cu @@ -14,9 +14,7 @@ * and the kernel will get considerably slower. */ -#include -#include -using thrust::complex; +#include "common.cuh" #define pr_dlayer(k) addr[(k)] #define pr_roi_row(k) addr[1 * num_pods + (k)] diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu b/ptypy/accelerate/cuda_common/pr_update2_ML.cu similarity index 98% rename from ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu rename to ptypy/accelerate/cuda_common/pr_update2_ML.cu index 8a45891c5..167610ea6 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update2_ML.cu +++ b/ptypy/accelerate/cuda_common/pr_update2_ML.cu @@ -14,9 +14,7 @@ * and the kernel will get considerably slower. 
*/ -#include -#include -using thrust::complex; +#include "common.cuh" #define pr_dlayer(k) addr[(k)] #define pr_roi_row(k) addr[1 * num_pods + (k)] diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu b/ptypy/accelerate/cuda_common/pr_update_ML.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu rename to ptypy/accelerate/cuda_common/pr_update_ML.cu index 3fa24137d..ad32dfe8a 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_ML.cu +++ b/ptypy/accelerate/cuda_common/pr_update_ML.cu @@ -7,8 +7,7 @@ */ -#include -using thrust::complex; +#include "common.cuh" template __device__ inline void atomicAdd(complex* x, const complex& y) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu b/ptypy/accelerate/cuda_common/pr_update_local.cu similarity index 97% rename from ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu rename to ptypy/accelerate/cuda_common/pr_update_local.cu index d515afd55..cf221aadd 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/pr_update_local.cu +++ b/ptypy/accelerate/cuda_common/pr_update_local.cu @@ -7,8 +7,7 @@ * - ACC_TYPE: data type used in norm calculation (input here) */ -#include -using thrust::complex; +#include "common.cuh" template __device__ inline void atomicAdd(complex* x, const complex& y) diff --git a/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu b/ptypy/accelerate/cuda_common/transpose.cu similarity index 96% rename from ptypy/accelerate/cuda_pycuda/cuda/transpose.cu rename to ptypy/accelerate/cuda_common/transpose.cu index 8de4e7ad7..f00be8937 100644 --- a/ptypy/accelerate/cuda_pycuda/cuda/transpose.cu +++ b/ptypy/accelerate/cuda_common/transpose.cu @@ -10,8 +10,7 @@ * - DTYPE - any pod type */ -#include -using thrust::complex; +#include "common.cuh" extern "C" __global__ void transpose(const DTYPE* idata, DTYPE* odata, diff --git a/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu b/ptypy/accelerate/cuda_common/update_addr_error_state.cu similarity index 94% 
# NOTE: this span of the patch was whitespace-collapsed. The added Python files
# are re-indented below; the surrounding patch metadata is kept as comments.
#
# rename from ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu
# rename to ptypy/accelerate/cuda_common/update_addr_error_state.cu
# index 1220a0986..e4045a38b 100644
# --- a/ptypy/accelerate/cuda_pycuda/cuda/update_addr_error_state.cu
# +++ b/ptypy/accelerate/cuda_common/update_addr_error_state.cu
# @@ -5,9 +5,7 @@
#   * - OUT_TYPE: the data type for the outputs (float or double)
#   */
#  -#include
#  -#include
#  -using thrust::complex;
#  +#include "common.cuh"
#   extern "C" __global__ void update_addr_error_state(int* __restrict addr,
#       const int* __restrict mangled_addr,
#
# diff --git a/ptypy/accelerate/cuda_common/utils.py b/ptypy/accelerate/cuda_common/utils.py
# new file mode 100644
# index 000000000..a953bfdb4
# --- /dev/null
# +++ b/ptypy/accelerate/cuda_common/utils.py
# @@ -0,0 +1,18 @@
import numpy as np


def map2ctype(dt):
    """Map a numpy dtype to the name of the corresponding C type.

    :param dt: a numpy dtype (or scalar type) to translate.
    :return: the C type name as a string.
    :raises ValueError: if there is no mapping for ``dt``.
    """
    # NOTE(review): complex64 and complex128 map to the same string here, so
    # callers cannot tell single from double precision by the returned name.
    # Possibly template arguments were lost in this copy - confirm.
    table = (
        (np.float32, 'float'),
        (np.float64, 'double'),
        (np.complex64, 'complex'),
        (np.complex128, 'complex'),
        (np.int32, 'int'),
        (np.int64, 'long long'),
    )
    for np_type, c_name in table:
        if dt == np_type:
            return c_name
    raise ValueError('No mapping for {}'.format(dt))


# diff --git a/ptypy/accelerate/cuda_cupy/__init__.py b/ptypy/accelerate/cuda_cupy/__init__.py
# new file mode 100644
# index 000000000..717878241
# --- /dev/null
# +++ b/ptypy/accelerate/cuda_cupy/__init__.py
# @@ -0,0 +1,73 @@

from typing import Optional
import cupy as cp
import os

from ptypy.utils.verbose import headerline, log

# directory holding the .cu sources shared by the CUDA back-ends
kernel_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'cuda_common'))
compile_options =['-std=c++14', '-DPTYPY_CUPY_NVTRC=1', '-I' + kernel_dir, '-DNDEBUG']
# module-level stream / device, created lazily by get_context()
queue = None
device = None


def get_context(new_queue=False):
    """Select the GPU for this process and return a CUDA stream.

    The device index is ``parallel.rank_local``. The stream is created on the
    first call and cached in the module-level ``queue``; pass
    ``new_queue=True`` to re-select the device and create a fresh stream.

    :param new_queue: force creation of a new stream.
    :return: the ``cupy.cuda.Stream`` stored in ``queue``.
    :raises Exception: if the local rank is not smaller than the device count.
    """
    from ptypy.utils import parallel

    global queue, device

    if queue is None or new_queue:
        ndevs = cp.cuda.runtime.getDeviceCount()
        if parallel.rank_local >= ndevs:
            # adjacent literals instead of a backslash continuation inside the
            # string, which embedded the source indentation in the message
            raise Exception(
                'Local rank must be smaller than total device count, '
                'rank={}, rank_local={}, device_count={}'.format(
                    parallel.rank, parallel.rank_local, ndevs
                ))
        device = cp.cuda.Device(parallel.rank_local)
        device.use()
        queue = cp.cuda.Stream()

    return queue


def load_kernel(name, subs=None, file=None, options=None):
    """Read a kernel source from ``kernel_dir``, substitute, and compile it.

    :param name: kernel function name, or a tuple of names (then ``file``
        must be given).
    :param subs: mapping of placeholder text to replacement; each key found
        in the source is replaced by ``str(value)``. ``None`` means no
        substitutions.
    :param file: source file name relative to ``kernel_dir``; defaults to
        ``<name>.cu``.
    :param options: extra compiler options appended to ``compile_options``.
    :return: the compiled function, or a tuple of functions if ``name`` is
        not a string.
    :raises ValueError: if ``file`` is None and ``name`` is not a string.
    """
    if subs is None:
        subs = {}

    if file is None:
        if isinstance(name, str):
            fn = "%s/%s.cu" % (kernel_dir, name)
        else:
            raise ValueError(
                "name parameter must be a string if not filename is given")
    else:
        fn = "%s/%s" % (kernel_dir, file)

    with open(fn, 'r') as f:
        kernel = f.read()
    for k, v in subs.items():
        kernel = kernel.replace(k, str(v))
    # insert a preprocessor line directive to assist compiler errors
    escaped = fn.replace("\\", "\\\\")
    kernel = '#line 1 "{}"\n'.format(escaped) + kernel

    opt = [*compile_options]
    if options is not None:
        opt += list(options)
    module = cp.RawModule(code=kernel, options=tuple(opt))
    if isinstance(name, str):
        return module.get_function(name)
    else:  # tuple
        return tuple(module.get_function(n) for n in name)


def log_device_memory_stats(level=4, heading: str ='Device Memory Stats'):
    """Log device memory and CuPy memory-pool statistics.

    :param level: verbosity level passed to ``log``.
    :param heading: title rendered with ``headerline`` above the figures.
    """
    mempool = cp.get_default_memory_pool()
    pinned_pool = cp.get_default_pinned_memory_pool()
    # query once so the free and total figures come from the same snapshot
    free_bytes, total_bytes = cp.cuda.runtime.memGetInfo()
    log(level, '\n' + headerline(heading))
    log(level, f'Device id : {cp.cuda.Device().id}')
    log(level, f'Total device mem : {total_bytes/1024/1024} MB')
    log(level, f'Free device mem : {free_bytes/1024/1024} MB')
    log(level, f'MemoryPool size : {mempool.total_bytes()/1024/1024} MB')
    log(level, f'MemoryPool used : {mempool.used_bytes()/1024/1024} MB')
    log(level, f'MemoryPool limit : {mempool.get_limit()/1024/1024} MB')
    log(level, f'MemoryPool free blocks: {mempool.n_free_blocks()}')
    log(level, f'PinnedPool free blocks: {pinned_pool.n_free_blocks()}')
# \ No newline at end of file
# diff --git a/ptypy/accelerate/cuda_cupy/address_manglers.py b/ptypy/accelerate/cuda_cupy/address_manglers.py
# new file mode 100644
# index
# 000000000..ae4eeadbe
# --- /dev/null
# +++ b/ptypy/accelerate/cuda_cupy/address_manglers.py
# @@ -0,0 +1,81 @@
from . import load_kernel
import numpy as np
from ptypy.accelerate.base import address_manglers as npam
import cupy as cp


class BaseMangler(npam.BaseMangler):
    """GPU variant of the numpy address mangler.

    Holds a device copy of the shift table (``delta_gpu``) and launches the
    ``get_address`` kernel to produce mangled addresses.
    """

    def __init__(self, *args, queue_thread=None, **kwargs):
        """Forward all arguments to the numpy base class and load the kernel.

        :param queue_thread: optional stream; when set, it is made current
            (``use()``) before GPU work is issued.
        """
        super().__init__(*args, **kwargs)
        self.queue = queue_thread
        self.get_address_cuda = load_kernel("get_address")
        self.delta = None
        self.delta_gpu = None

    def _use_queue(self):
        """Make the configured stream current, if there is one."""
        if self.queue is not None:
            self.queue.use()

    def _setup_delta_gpu(self):
        """Upload ``self.delta`` to the device as a contiguous int32 array."""
        self._use_queue()
        assert self.delta is not None, "Setup delta using the setup_shifts method first"
        host_delta = np.ascontiguousarray(self.delta, dtype=np.int32)
        self.delta = host_delta

        # (re)allocate only when there is no buffer yet or it has too few rows
        needs_alloc = (self.delta_gpu is None
                       or self.delta_gpu.shape[0] < host_delta.shape[0])
        if needs_alloc:
            self.delta_gpu = cp.empty(host_delta.shape, dtype=np.int32)
        # in case self.delta is smaller than delta_gpu, this will only copy the
        # relevant part
        cp.cuda.runtime.memcpy(dst=self.delta_gpu.data.ptr,
                               src=host_delta.ctypes.data,
                               size=host_delta.nbytes,
                               kind=cp.cuda.runtime.memcpyHostToDevice)

    def get_address(self, index, addr_current, mangled_addr, max_oby, max_obx):
        """Write the addresses shifted by ``delta[index]`` into ``mangled_addr``.

        :param index: row of the shift table to apply.
        :param addr_current: int32, 4-dimensional address array (input).
        :param mangled_addr: pre-allocated int32 array of the same shape (output).
        :param max_oby: passed to the kernel as int32.
        :param max_obx: passed to the kernel as int32.
        """
        assert addr_current.dtype == np.int32, "addresses must be int32"
        assert mangled_addr.dtype == np.int32, "addresses must be int32"
        assert len(addr_current.shape) == 4, "addresses must be 4 dimensions"
        assert addr_current.shape == mangled_addr.shape, "output addresses must be pre-allocated"
        assert self.delta_gpu is not None, "Deltas are not set yet - call setup_shifts first"
        assert index < self.delta_gpu.shape[0], "Index out of range for deltas"
        assert isinstance(self.delta_gpu, cp.ndarray), "Only GPU arrays are supported for delta"

        self._use_queue()

        # product of the first two address dimensions, passed as one count
        n_entries = np.int32(addr_current.shape[0] * addr_current.shape[1])
        shift_row = self.delta_gpu[index, None]
        # only using a single thread block here as it's not enough work
        # otherwise
        grid = (1, 1, 1)
        block = (64, 1, 1)
        kernel_args = (addr_current,
                       mangled_addr,
                       n_entries,
                       shift_row,
                       np.int32(max_oby),
                       np.int32(max_obx))
        self.get_address_cuda(grid, block, kernel_args)


# with multiple inheritance, we have to be explicit which super class
# we are calling in the methods
class RandomIntMangler(BaseMangler, npam.RandomIntMangler):
    """Random-integer shift mangler with the shift table mirrored on the GPU."""

    def __init__(self, *args, **kwargs):
        BaseMangler.__init__(self, *args, **kwargs)

    def setup_shifts(self, *args, **kwargs):
        """Build the shifts with the numpy implementation, then upload them."""
        npam.RandomIntMangler.setup_shifts(self, *args, **kwargs)
        self._setup_delta_gpu()

    def get_address(self, *args, **kwargs):
        """Delegate to the GPU implementation in ``BaseMangler``."""
        BaseMangler.get_address(self, *args, **kwargs)


class GridSearchMangler(BaseMangler, npam.GridSearchMangler):
    """Grid-search shift mangler with the shift table mirrored on the GPU."""

    def __init__(self, *args, **kwargs):
        BaseMangler.__init__(self, *args, **kwargs)

    def setup_shifts(self, *args, **kwargs):
        """Build the shifts with the numpy implementation, then upload them."""
        npam.GridSearchMangler.setup_shifts(self, *args, **kwargs)
        self._setup_delta_gpu()

    def get_address(self, *args, **kwargs):
        """Delegate to the GPU implementation in ``BaseMangler``."""
        BaseMangler.get_address(self, *args, **kwargs)
# \ No newline at end of file
# diff --git a/ptypy/accelerate/cuda_cupy/array_utils.py b/ptypy/accelerate/cuda_cupy/array_utils.py
# new file mode 100644
# index 000000000..911c6111d
# --- /dev/null
# +++ b/ptypy/accelerate/cuda_cupy/array_utils.py
# @@ -0,0 +1,670 @@
import cupy as cp
import numpy as np

from ptypy.accelerate.cuda_common.utils import map2ctype
from ptypy.utils.math_utils import gaussian
from . import load_kernel
import load_kernel + + +class ArrayUtilsKernel: + def __init__(self, acc_dtype=cp.float64, queue=None): + self.queue = queue + self.acc_dtype = acc_dtype + # Note: cupy's ReductionKernel is far less efficient + self.cdot_cuda = load_kernel("dot", { + 'IN_TYPE': 'complex', + 'ACC_TYPE': 'double' if acc_dtype == np.float64 else 'float' + }) + self.dot_cuda = load_kernel("dot", { + 'IN_TYPE': 'float', + 'ACC_TYPE': 'double' if acc_dtype == np.float64 else 'float' + }) + self.full_reduce_cuda = load_kernel("full_reduce", { + 'IN_TYPE': 'double' if acc_dtype == np.float64 else 'float', + 'OUT_TYPE': 'double' if acc_dtype == np.float64 else 'float', + 'ACC_TYPE': 'double' if acc_dtype == np.float64 else 'float', + 'BDIM_X': 1024 + }) + self.Ctmp = None + + def dot(self, A: cp.ndarray, B: cp.ndarray, out: cp.ndarray = None) -> cp.ndarray: + assert A.dtype == B.dtype, "Input arrays must be of same data type" + assert A.size == B.size, "Input arrays must be of the same size" + + if self.queue is not None: + self.queue.use() + if out is None: + out = cp.empty(1, dtype=self.acc_dtype) + + block = (1024, 1, 1) + grid = (int((B.size + 1023) // 1024), 1, 1) + if self.acc_dtype == np.float32: + elsize = 4 + elif self.acc_dtype == np.float64: + elsize = 8 + if self.Ctmp is None or self.Ctmp.size < grid[0]: + self.Ctmp = cp.zeros((grid[0],), dtype=self.acc_dtype) + Ctmp = self.Ctmp + if grid[0] == 1: + Ctmp = out + if np.iscomplexobj(B): + self.cdot_cuda(grid, block, (A, B, np.int32(A.size), Ctmp), + shared_mem=1024 * elsize) + else: + self.dot_cuda(grid, block, (A, B, np.int32(A.size), Ctmp), + shared_mem=1024 * elsize) + if grid[0] > 1: + self.full_reduce_cuda((1, 1, 1), (1024, 1, 1), (self.Ctmp, out, np.int32(grid[0])), + shared_mem=elsize*1024) + + return out + + def norm2(self, A, out=None): + return self.dot(A, A, out) + + +class TransposeKernel: + + def __init__(self, queue=None): + self.queue = queue + self.transpose_cuda = load_kernel("transpose", { + 'DTYPE': 'int', + 
'BDIM': 16 + }) + + def transpose(self, input, output): + # only for int at the moment (addr array), and 2D (reshape pls) + if len(input.shape) != 2: + raise ValueError( + "Only 2D tranpose is supported - reshape as desired") + if input.shape[0] != output.shape[1] or input.shape[1] != output.shape[0]: + raise ValueError("Input/Output must be of flipped shape") + if input.dtype != np.int32 or output.dtype != np.int32: + raise ValueError("Only int types are supported at the moment") + + width = input.shape[1] + height = input.shape[0] + blk = (16, 16, 1) + grd = ( + int((input.shape[1] + 15) // 16), + int((input.shape[0] + 15) // 16), + 1 + ) + if self.queue is not None: + self.queue.use() + self.transpose_cuda( + grd, blk, (input, output, np.int32(width), np.int32(height))) + + +class MaxAbs2Kernel: + + def __init__(self, queue=None): + self.queue = queue + # we lazy-load this depending on the data types we get + self.max_abs2_cuda = {} + + def max_abs2(self, X: cp.ndarray, out: cp.ndarray): + """ Calculate max(abs(x)**2) across the final 2 dimensions""" + rows = np.int32(X.shape[-2]) + cols = np.int32(X.shape[-1]) + firstdims = np.int32(np.prod(X.shape[:-2])) + gy = int(rows) + # lazy-loading, keeping scratch memory and both kernels in the same dictionary + bx = int(64) + version = '{},{},{}'.format( + map2ctype(X.dtype), map2ctype(out.dtype), gy) + if version not in self.max_abs2_cuda: + step1, step2 = load_kernel( + ("max_abs2_step1", "max_abs2_step2"), + { + 'IN_TYPE': map2ctype(X.dtype), + 'OUT_TYPE': map2ctype(out.dtype), + 'BDIM_X': bx, + }, "max_abs2.cu") + self.max_abs2_cuda[version] = { + 'step1': step1, + 'step2': step2, + 'scratchmem': cp.empty((gy,), dtype=out.dtype) + } + + # if self.max_abs2_cuda[version]['scratchmem'] is None \ + # or self.max_abs2_cuda[version]['scratchmem'].shape[0] != gy: + # self.max_abs2_cuda[version]['scratchmem'] = + scratch = self.max_abs2_cuda[version]['scratchmem'] + + if self.queue is not None: + self.queue.use() + + 
self.max_abs2_cuda[version]['step1']( + (1, gy, 1), (bx, 1, 1), (X, firstdims, rows, cols, scratch)) + self.max_abs2_cuda[version]['step2']( + (1, 1, 1), (bx, 1, 1), (scratch, np.int32(gy), out)) + + +class CropPadKernel: + + def __init__(self, queue=None): + self.queue = queue + # we lazy-load this depending on the data types we get + self.fill3D_cuda = {} + + def fill3D(self, A, B, offset=[0, 0, 0]): + """ + Fill 3-dimensional array A with B. + """ + if A.ndim < 3 or B.ndim < 3: + raise ValueError('Input arrays must each be at least 3D') + assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions." + ash = A.shape + bsh = B.shape + misfit = np.array(bsh) - np.array(ash) + assert not misfit[:-3].any( + ), "Input and Output must have the same shape everywhere but the last three axes." + + Alim = np.array(A.shape[-3:]) + Blim = np.array(B.shape[-3:]) + off = np.array(offset) + Ao = off.copy() + Ao[Ao < 0] = 0 + Bo = -off.copy() + Bo[Bo < 0] = 0 + assert (Bo < Blim).all() and (Ao < Alim).all( + ), "At least one dimension lacks overlap" + Ao = Ao.astype(np.int32) + Bo = Bo.astype(np.int32) + lengths = np.array([ + min(off[0] + Blim[0], Alim[0]) - Ao[0], + min(off[1] + Blim[1], Alim[1]) - Ao[1], + min(off[2] + Blim[2], Alim[2]) - Ao[2], + ], dtype=np.int32) + lengths2 = np.array([ + min(Alim[0] - off[0], Blim[0]) - Bo[0], + min(Alim[1] - off[1], Blim[1]) - Bo[1], + min(Alim[2] - off[2], Blim[2]) - Bo[2], + ], dtype=np.int32) + assert (lengths == lengths2).all( + ), "left and right lenghts are not matching" + batch = int(np.prod(A.shape[:-3])) + + # lazy loading depending on data type + version = '{},{}'.format(map2ctype(B.dtype), map2ctype(A.dtype)) + if version not in self.fill3D_cuda: + self.fill3D_cuda[version] = load_kernel("fill3D", { + 'IN_TYPE': map2ctype(B.dtype), + 'OUT_TYPE': map2ctype(A.dtype) + }) + bx = by = 32 + if self.queue is not None: + self.queue.use() + self.fill3D_cuda[version]( + (int((lengths[2] + bx - 1)//bx), + 
int((lengths[1] + by - 1)//by), + int(batch)), + (int(bx), int(by), int(1)), + (A, B, + np.int32(A.shape[-3]), np.int32(A.shape[-2] + ), np.int32(A.shape[-1]), + np.int32(B.shape[-3]), np.int32(B.shape[-2] + ), np.int32(B.shape[-1]), + Ao[0], Ao[1], Ao[2], + Bo[0], Bo[1], Bo[2], + lengths[0], lengths[1], lengths[2]) + ) + + def crop_pad_2d_simple(self, A, B): + """ + Places B in A centered around the last two axis. A and B must be of the same shape + anywhere but the last two dims. + """ + assert A.ndim >= 2, "Arrays must have more than 2 dimensions." + assert A.ndim == B.ndim, "Input and Output must have the same number of dimensions." + misfit = np.array(A.shape) - np.array(B.shape) + assert not misfit[:-2].any( + ), "Input and Output must have the same shape everywhere but the last two axes." + if A.ndim == 2: + A = A.reshape((1,) + A.shape) + if B.ndim == 2: + B = B.reshape((1,) + B.shape) + a1, a2 = A.shape[-2:] + b1, b2 = B.shape[-2:] + offset = [0, a1 // 2 - b1 // 2, a2 // 2 - b2 // 2] + self.fill3D(A, B, offset) + + +class DerivativesKernel: + def __init__(self, dtype, queue=None): + if dtype == np.float32: + stype = "float" + elif dtype == np.complex64: + stype = "complex" + else: + raise NotImplementedError( + "delxf is only implemented for float32 and complex64") + + self.queue = queue + self.dtype = dtype + self.last_axis_block = (256, 4, 1) + self.mid_axis_block = (256, 4, 1) + + self.delxf_last, self.delxf_mid = load_kernel( + ("delx_last", "delx_mid"), + file="delx.cu", + subs={ + 'IS_FORWARD': 'true', + 'BDIM_X': str(self.last_axis_block[0]), + 'BDIM_Y': str(self.last_axis_block[1]), + 'IN_TYPE': stype, + 'OUT_TYPE': stype + }) + self.delxb_last, self.delxb_mid = load_kernel( + ("delx_last", "delx_mid"), + file="delx.cu", + subs={ + 'IS_FORWARD': 'false', + 'BDIM_X': str(self.last_axis_block[0]), + 'BDIM_Y': str(self.last_axis_block[1]), + 'IN_TYPE': stype, + 'OUT_TYPE': stype + }) + + def delxf(self, input, out, axis=-1): + if input.dtype != 
self.dtype: + raise ValueError('Invalid input data type') + + if axis < 0: + axis = input.ndim + axis + axis = np.int32(axis) + + if self.queue is not None: + self.queue.use() + + if axis == input.ndim - 1: + flat_dim = np.int32(np.product(input.shape[0:-1])) + self.delxf_last(( + int((flat_dim + + self.last_axis_block[1] - 1) // self.last_axis_block[1]), + 1, 1), + self.last_axis_block, (input, out, flat_dim, np.int32(input.shape[axis]))) + else: + lower_dim = np.int32(np.product(input.shape[(axis+1):])) + higher_dim = np.int32(np.product(input.shape[:axis])) + gx = int( + (lower_dim + self.mid_axis_block[0] - 1) // self.mid_axis_block[0]) + gy = 1 + gz = int(higher_dim) + self.delxf_mid((gx, gy, gz), self.mid_axis_block, (input, + out, lower_dim, higher_dim, np.int32(input.shape[axis]))) + + def delxb(self, input, out, axis=-1): + if input.dtype != self.dtype: + raise ValueError('Invalid input data type') + + if axis < 0: + axis = input.ndim + axis + axis = np.int32(axis) + + if self.queue is not None: + self.queue.use() + if axis == input.ndim - 1: + flat_dim = np.int32(np.product(input.shape[0:-1])) + self.delxb_last(( + int((flat_dim + + self.last_axis_block[1] - 1) // self.last_axis_block[1]), + 1, 1), self.last_axis_block, (input, out, flat_dim, np.int32(input.shape[axis]))) + else: + lower_dim = np.int32(np.product(input.shape[(axis+1):])) + higher_dim = np.int32(np.product(input.shape[:axis])) + gx = int( + (lower_dim + self.mid_axis_block[0] - 1) // self.mid_axis_block[0]) + gy = 1 + gz = int(higher_dim) + self.delxb_mid((gx, gy, gz), self.mid_axis_block, (input, + out, lower_dim, higher_dim, np.int32(input.shape[axis]))) + + +class GaussianSmoothingKernel: + def __init__(self, queue=None, num_stdevs=4, kernel_type='float'): + if kernel_type not in ['float', 'double']: + raise ValueError('Invalid data type for kernel') + self.kernel_type = kernel_type + self.dtype = np.complex64 + self.stype = "complex" + self.queue = queue + self.num_stdevs = num_stdevs 
+ self.blockdim_x = 4 + self.blockdim_y = 16 + + # At least 2 blocks per SM + self.max_shared_per_block = 48 * 1024 // 2 + self.max_shared_per_block_complex = self.max_shared_per_block / \ + 2 * np.dtype(np.float32).itemsize + self.max_kernel_radius = int( + self.max_shared_per_block_complex / self.blockdim_y) + + self.convolution_row = load_kernel( + "convolution_row", file="convolution.cu", subs={ + 'BDIM_X': self.blockdim_x, + 'BDIM_Y': self.blockdim_y, + 'DTYPE': self.stype, + 'MATH_TYPE': self.kernel_type + }) + self.convolution_col = load_kernel( + "convolution_col", file="convolution.cu", subs={ + 'BDIM_X': self.blockdim_y, # NOTE: we swap x and y in this columns + 'BDIM_Y': self.blockdim_x, + 'DTYPE': self.stype, + 'MATH_TYPE': self.kernel_type + }) + # pre-allocate kernel memory on gpu, with max-radius to accomodate + dtype = np.float32 if self.kernel_type == 'float' else np.float64 + self.kernel_gpu = cp.empty((self.max_kernel_radius,), dtype=dtype) + # keep track of previus radius and std to determine if we need to transfer again + self.r = 0 + self.std = 0 + + def convolution(self, data, mfs, tmp=None): + """ + Calculates a stacked 2D convolution for smoothing, with the standard deviations + given in mfs (stdx, stdy). 
It works in-place in the data array, + and tmp is a gpu-allocated array of the same size and type as data, + used internally for temporary storage + """ + ndims = data.ndim + shape = data.shape + + # Create temporary array (if not given) + if tmp is None: + tmp = cp.empty(shape, dtype=data.dtype) + assert shape == tmp.shape and data.dtype == tmp.dtype + + # Check input dimensions + if ndims == 3: + batches, y, x = shape + stdy, stdx = mfs + elif ndims == 2: + batches = 1 + y, x = shape + stdy, stdx = mfs + elif ndims == 1: + batches = 1 + y, x = shape[0], 1 + stdy, stdx = mfs[0], 0.0 + else: + raise NotImplementedError( + "input needs to be of dimensions 0 < ndims <= 3") + + input = data + output = tmp + + if self.queue is not None: + self.queue.use() + + # Row convolution kernel + # TODO: is this threshold acceptable in all cases? + if stdx > 0.1: + r = int(self.num_stdevs * stdx + 0.5) + if r > self.max_kernel_radius: + raise ValueError("Size of Gaussian kernel too large") + if r != self.r or stdx != self.std: + # recalculate + transfer + g = gaussian(np.arange(-r, r+1), stdx) + g /= g.sum() + k = np.ascontiguousarray(g[r:].astype( + np.float32 if self.kernel_type == 'float' else np.float64)) + self.kernel_gpu[:r+1] = cp.asarray(k[:]) + self.r = r + self.std = stdx + + bx = self.blockdim_x + by = self.blockdim_y + + shared = (bx + 2*r) * by * np.dtype(np.complex64).itemsize + if shared > self.max_shared_per_block: + raise MemoryError("Cannot run kernel in shared memory") + + blk = (bx, by, 1) + grd = (int((y + bx - 1) // bx), int((x + by-1) // by), batches) + self.convolution_row(grd, blk, (input, output, np.int32(y), np.int32(x), self.kernel_gpu, np.int32(r)), + shared_mem=shared) + + input = output + output = data + + # Column convolution kernel + # TODO: is this threshold acceptable in all cases? 
+ if stdy > 0.1: + r = int(self.num_stdevs * stdy + 0.5) + if r > self.max_kernel_radius: + raise ValueError("Size of Gaussian kernel too large") + if r != self.r or stdy != self.std: + # recalculate + transfer + g = gaussian(np.arange(-r, r+1), stdy) + g /= g.sum() + k = np.ascontiguousarray(g[r:].astype( + np.float32 if self.kernel_type == 'float' else np.float64)) + self.kernel_gpu[:r+1] = cp.asarray(k[:]) + self.r = r + self.std = stdy + + bx = self.blockdim_y + by = self.blockdim_x + + shared = (by + 2*r) * bx * np.dtype(np.complex64).itemsize + if shared > self.max_shared_per_block: + raise MemoryError("Cannot run kernel in shared memory") + + blk = (bx, by, 1) + grd = (int((y + bx - 1) // bx), int((x + by-1) // by), batches) + self.convolution_col(grd, blk, (input, output, np.int32(y), np.int32(x), self.kernel_gpu, np.int32(r)), + shared_mem=shared) + + # TODO: is this threshold acceptable in all cases? + if (stdx <= 0.1 and stdy <= 0.1): + return # nothing to do + elif (stdx > 0.1 and stdy > 0.1): + return # both parts have run, output is back in data + else: + data[:] = tmp[:] # only one of them has run, output is in tmp + + +class ClipMagnitudesKernel: + + def __init__(self, queue=None): + self.queue = queue + self.clip_magnitudes_cuda = load_kernel("clip_magnitudes", { + 'IN_TYPE': 'complex', + }) + + def clip_magnitudes_to_range(self, array, clip_min, clip_max): + if self.queue is not None: + self.queue.use() + + cmin = np.float32(clip_min) + cmax = np.float32(clip_max) + + npixel = np.int32(np.prod(array.shape)) + bx = 256 + gx = int((npixel + bx - 1) // bx) + self.clip_magnitudes_cuda((gx, 1, 1), (bx, 1, 1), (array, cmin, cmax, + npixel)) + +class MassCenterKernel: + + def __init__(self, queue=None): + self.queue = queue + self.threadsPerBlock = 256 + + self.indexed_sum_middim_cuda = load_kernel("indexed_sum_middim", + file="mass_center.cu", subs={ + 'IN_TYPE': 'float', + 'BDIM_X' : self.threadsPerBlock, + 'BDIM_Y' : 1, + } + ) + + 
self.indexed_sum_lastdim_cuda = load_kernel("indexed_sum_lastdim", + file="mass_center.cu", subs={ + 'IN_TYPE': 'float', + 'BDIM_X' : 32, + 'BDIM_Y' : 32, + } + ) + + self.final_sums_cuda = load_kernel("final_sums", + file="mass_center.cu", subs={ + 'IN_TYPE': 'float', + 'BDIM_X' : 256, + 'BDIM_Y' : 1, + } + ) + + def mass_center(self, array): + if array.dtype != np.float32: + raise NotImplementedError("mass_center is only implemented for float32") + + i = np.int32(array.shape[0]) + m = np.int32(array.shape[1]) + if array.ndim >= 3: + n = np.int32(array.shape[2]) + else: + n = np.int32(1) + + if self.queue is not None: + self.queue.use() + + total_sum = cp.sum(array, dtype=np.float32).get() + sc = np.float32(1. / total_sum.item()) + + i_sum = cp.empty(array.shape[0], dtype=np.float32) + m_sum = cp.empty(array.shape[1], dtype=np.float32) + n_sum = cp.empty(int(n), dtype=np.float32) + out = cp.empty(3 if n>1 else 2, dtype=np.float32) + + # sum all dims except the first, multiplying by the index and scaling factor + block_ = (self.threadsPerBlock, 1, 1) + grid_ = (int(i), 1, 1) + self.indexed_sum_middim_cuda(grid_, block_, (array, i_sum, np.int32(1), i, n*m, sc), + shared_mem=self.threadsPerBlock*4) + + if array.ndim >= 3: + # 3d case + # sum all dims, except the middle, multiplying by the index and scaling factor + block_ = (self.threadsPerBlock, 1, 1) + grid_ = (int(m), 1, 1) + self.indexed_sum_middim_cuda(grid_, block_, (array, m_sum, i, n, m, sc), + shared_mem=self.threadsPerBlock*4) + + # sum the all dims except the last, multiplying by the index and scaling factor + block_ = (32, 32, 1) + grid_ = (1, int(n + 32 - 1) // 32, 1) + self.indexed_sum_lastdim_cuda(grid_, block_, (array, n_sum, i*m, n, sc), + shared_mem=32*32*4) + else: + # 2d case + # sum the all dims except the last, multiplying by the index and scaling factor + block_ = (32, 32, 1) + grid_ = (1, int(m + 32 - 1) // 32, 1) + self.indexed_sum_lastdim_cuda(grid_, block_, (array, m_sum, i, m, sc), + 
class Abs2SumKernel:
    """
    Launcher for the ``abs2sum`` CUDA kernel.

    The kernel is compiled for the C type matching ``dtype``. For complex
    input the output type is the real type of the same precision; for any
    other input the output type equals the input type.
    """

    def __init__(self, dtype, queue=None):
        """
        :param dtype: numpy dtype of the arrays that will be passed to
            :meth:`abs2sum`.
        :param queue: optional stream; if given, ``queue.use()`` is called
            before every launch.
        """
        self.in_stype = map2ctype(dtype)
        # Dispatch on the numpy dtype rather than on the C type string. The
        # string comparison used previously contained a misspelling in the
        # double-precision branch, so that branch never matched and
        # double-precision complex input was given a complex output type.
        np_dtype = np.dtype(dtype)
        if np_dtype == np.complex64:
            self.out_stype = 'float'
            self.out_dtype = np.float32
        elif np_dtype == np.complex128:
            self.out_stype = 'double'
            self.out_dtype = np.float64
        else:
            self.out_stype = self.in_stype
            self.out_dtype = dtype

        self.queue = queue
        self.threadsPerBlock = 32

        self.abs2sum_cuda = load_kernel("abs2sum", subs={
            'IN_TYPE': self.in_stype,
            'OUT_TYPE': self.out_stype,
            # must agree with the block size used at launch time
            'BDIM_X': self.threadsPerBlock,
        })

    def abs2sum(self, array):
        """
        Run the kernel on a 3D ``array`` and return a new device array of
        shape ``array.shape[1:]`` and dtype ``self.out_dtype``.

        The first axis length is passed to the kernel as the number of
        modes; presumably the kernel sums ``|array|**2`` over that axis --
        confirm against the kernel source.
        """
        nmodes = np.int32(array.shape[0])
        row, col = array.shape[1:]
        out = cp.empty(array.shape[1:], dtype=self.out_dtype)

        if self.queue is not None:
            self.queue.use()
        # one block per row of the output
        block_ = (self.threadsPerBlock, 1, 1)
        grid_ = (1, row, 1)
        self.abs2sum_cuda(grid_, block_,
                          (array, nmodes, np.int32(row), np.int32(col), out))

        return out
np.modf(offsetRow) + offsetColFrac, offsetColInt = np.modf(offsetCol) + + if self.queue is not None: + self.queue.use() + + out = cp.empty_like(array) + block_ = (32, 32, 1) + grid_ = ((rows + 31) // 32, (columns + 31) // 32, items) + + if np.abs(offsetRowFrac) < 1e-6 and np.abs(offsetColFrac) < 1e-6: + if offsetRowInt == 0 and offsetColInt == 0: + # no transformation at all + out = array + else: + # no fractional part, so we can just use a shifted copy + self.integer_shift_cuda(grid_, block_, (array, out, np.int32(rows), + np.int32(columns), np.int32(offsetRow), + np.int32(offsetCol))) + else: + self.linear_interpolate_cuda(grid_, block_, (array, out, np.int32(rows), + np.int32(columns), np.float32(offsetRow), + np.float32(offsetCol)), + shared_mem=(32+2)**2*8+32*(32+2)*8) + + return out + diff --git a/ptypy/accelerate/cuda_cupy/cufft.py b/ptypy/accelerate/cuda_cupy/cufft.py new file mode 100644 index 000000000..794efb858 --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/cufft.py @@ -0,0 +1,171 @@ +import cupy as cp +from cupyx.scipy import fft as cuxfft +from cupyx.scipy.fft import get_fft_plan +from . 
class FFT_cuda(object):
    """
    2D batched FFT backed by the external ``filtered_cufft`` extension module.

    Only square transforms with a side length that is a power of two between
    16 and 2048 are accepted. All leading axes of the input are flattened
    into the batch count.
    """

    def __init__(self, array, queue=None,
                 inplace=False,
                 pre_fft=None,
                 post_fft=None,
                 symmetric=True,
                 forward=True):
        """
        :param array: template array; only ``ndim`` and ``shape`` are read here.
        :param queue: stream object; its ``ptr`` attribute is handed to the
            extension module in :meth:`_load`.
        :param inplace: accepted for interface compatibility, not used here.
        :param pre_fft: optional filter passed on to :meth:`_load`.
        :param post_fft: optional filter passed on to :meth:`_load`.
        :param symmetric: passed on to :meth:`_load`.
        :param forward: stored on the instance and passed on to :meth:`_load`.
        :raises AssertionError: if ``array`` has fewer than two dimensions.
        :raises ValueError: if the last two axes are not square or not one of
            the supported sizes.
        """
        self._queue = queue
        dims = array.ndim
        if dims < 2:
            raise AssertionError('Input array must be at least 2-dimensional')
        self.arr_shape = (array.shape[-2], array.shape[-1])
        rows = self.arr_shape[0]
        columns = self.arr_shape[1]
        if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]:
            raise ValueError(
                "CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048")
        # np.prod instead of np.product: the latter alias was removed in NumPy 2.0
        self.batches = int(np.prod(
            array.shape[0:dims-2]) if dims > 2 else 1)
        self.forward = forward

        self._load(array, pre_fft, post_fft, symmetric, forward)

    def _load(self, array, pre_fft, post_fft, symmetric, forward):
        """
        Upload the optional filters and create the ``FilteredFFT`` object.

        A filter that is not given is passed to the extension as pointer 0.
        """
        if pre_fft is not None:
            # keep a reference so the device buffer outlives the raw pointer
            self.pre_fft = cp.asarray(pre_fft)
            self.pre_fft_ptr = self.pre_fft.data.ptr
        else:
            self.pre_fft_ptr = 0
        if post_fft is not None:
            self.post_fft = cp.asarray(post_fft)
            self.post_fft_ptr = self.post_fft.data.ptr
        else:
            self.post_fft_ptr = 0

        # imported here so that the extension is only required when this
        # class is actually instantiated
        import filtered_cufft
        # NOTE(review): self._queue.ptr fails if queue is None (the
        # constructor default) -- confirm callers always pass a stream.
        self.fftobj = filtered_cufft.FilteredFFT(
            self.batches,
            self.arr_shape[0],
            self.arr_shape[1],
            symmetric,
            forward,
            self.pre_fft_ptr,
            self.post_fft_ptr,
            self._queue.ptr)

        self.ft = self._ft
        self.ift = self._ift

    @property
    def queue(self):
        """Stream currently associated with this FFT object."""
        return self._queue

    @queue.setter
    def queue(self, queue):
        self._queue = queue
        # keep the extension object on the same stream
        self.fftobj.queue = self._queue.ptr

    def _ft(self, input, output):
        """Call the extension's ``fft`` on the raw device pointers."""
        self.fftobj.fft(input.data.ptr, output.data.ptr)

    def _ift(self, input, output):
        """Call the extension's ``ifft`` on the raw device pointers."""
        self.fftobj.ifft(input.data.ptr, output.data.ptr)
in [ + np.complex64, np.complex128] if pre_fft is not None else True) + assert (post_fft.dtype in [ + np.complex64, np.complex128] if post_fft is not None else True) + + math_type = 'float' if array.dtype == np.complex64 else 'double' + if pre_fft is not None: + math_type = 'float' if pre_fft.dtype == np.complex64 else 'double' + self.pre_fft_knl = load_kernel("batched_multiply", { + 'MPY_DO_SCALE': 'false', + 'MPY_DO_FILT': 'true', + 'IN_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'OUT_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'MATH_TYPE': math_type + }) if pre_fft is not None else None + + math_type = 'float' if array.dtype == np.complex64 else 'double' + if post_fft is not None: + math_type = 'float' if post_fft.dtype == np.complex64 else 'double' + self.post_fft_knl = load_kernel("batched_multiply", { + 'MPY_DO_SCALE': 'true' if (not forward and not symmetric) or symmetric else 'false', + 'MPY_DO_FILT': 'true' if post_fft is not None else 'false', + 'IN_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'OUT_TYPE': 'float' if array.dtype == np.complex64 else 'double', + 'MATH_TYPE': math_type + }) if (not (forward and not symmetric) or post_fft is not None) else None + + self.block = (32, 32, 1) + self.grid = ( + int((self.arr_shape[0] + 31) // 32), + int((self.arr_shape[1] + 31) // 32), + int(self.batches) + ) + if self.queue is not None: + self.queue.use() + self.plan = get_fft_plan(array, self.arr_shape, axes=(-2, -1), value_type="C2C") + self.scale = 1.0 + self.norm = 'ortho' if symmetric else 'backward' + + if pre_fft is not None: + self.pre_fft = cp.asarray(pre_fft) + else: + self.pre_fft = np.intp(0) # NULL + if post_fft is not None: + self.post_fft = cp.asarray(post_fft) + else: + self.post_fft = np.intp(0) + + self.ft = self._ft + self.ift = self._ift + + def _prefilt(self, x, y): + if self.pre_fft_knl: + self.pre_fft_knl(grid=self.grid, + block=self.block, + args=(x, y, self.pre_fft, + 
np.float32(self.scale), + np.int32(self.batches), + np.int32(self.arr_shape[0]), + np.int32(self.arr_shape[1]))) + else: + y[:] = x[:] + + def _postfilt(self, y): + if self.post_fft_knl: + assert self.post_fft is not None + assert self.scale is not None + self.post_fft_knl(grid=self.grid, + block=self.block, + args=(y, y, self.post_fft, np.float32(self.scale), + np.int32(self.batches), + np.int32(self.arr_shape[0]), + np.int32(self.arr_shape[1]))) + def _ft(self, x, y): + if self.queue is not None: + self.queue.use() + self._prefilt(x, y) + cuxfft.fft2(y, axes=(-2, -1), plan=self.plan, overwrite_x=True, norm=self.norm) + self._postfilt(y) + + def _ift(self, x, y): + if self.queue is not None: + self.queue.use() + self._prefilt(x, y) + cuxfft.ifft2(y, axes=(-2, -1), plan=self.plan, overwrite_x=True, norm=self.norm) + self._postfilt(y) diff --git a/ptypy/accelerate/cuda_cupy/dependencies.yml b/ptypy/accelerate/cuda_cupy/dependencies.yml new file mode 100644 index 000000000..cb7d31fce --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/dependencies.yml @@ -0,0 +1,17 @@ +name: ptypy_cupy +channels: + - conda-forge +dependencies: + - python=3.9 + - numpy + - scipy + - matplotlib + - h5py + - pyzmq + - mpi4py + - pillow + - pyfftw + - cupy + - cudatoolkit-dev + - pip + - compilers \ No newline at end of file diff --git a/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py b/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py new file mode 100644 index 000000000..c3cb39c09 --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py @@ -0,0 +1,804 @@ +# -*- coding: utf-8 -*- +""" +Maximum Likelihood reconstruction engine. + +TODO. + + * Implement other regularizers + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: see LICENSE for details. 
+""" +import numpy as np +import cupy as cp +import cupyx + +from ptypy.engines import register +from ptypy.accelerate.base.engines.ML_serial import ML_serial, BaseModelSerial +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from .. import get_context, log_device_memory_stats +from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel +from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ..array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel, TransposeKernel +from ..mem_utils import GpuDataManager + +#from ..mem_utils import GpuDataManager +from ptypy.accelerate.base import address_manglers + +__all__ = ['ML_cupy'] + +# can be used to limit the number of blocks, simulating that they don't fit +MAX_BLOCKS = 99999 +# MAX_BLOCKS = 3 # can be used to limit the number of blocks, simulating that they don't fit + + +@register() +class ML_cupy(ML_serial): + + """ + Defaults: + + [probe_update_cuda_atomics] + default = False + type = bool + help = For GPU, use the atomics version for probe update kernel + + [object_update_cuda_atomics] + default = True + type = bool + help = For GPU, use the atomics version for object update kernel + + [fft_lib] + default = cuda + type = str + help = Choose the cupy-compatible FFT module. + doc = One of: + - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU) + - ``'cupy'`` : cupy using cufft (fast load, slowest compute due to additional store/load stages) + choices = 'cuda','cupy' + userlevel = 2 + + """ + + def __init__(self, ptycho_parent, pars=None): + """ + Maximum likelihood reconstruction engine. + """ + super().__init__(ptycho_parent, pars) + + def engine_initialize(self): + """ + Prepare for ML reconstruction. 
def _setup_kernels(self):
    """
    Setup kernels, one for each scan. Derive scans from ptycho class.

    For every scan a parameter container holding the work buffers and the
    kernel objects is stored in ``self.kernels``. Afterwards the number of
    data blocks that fit on the device is estimated and the two
    ``GpuDataManager`` instances (``self.w_data``, ``self.I_data``) are
    created.

    :raises SystemExit: if not even one block fits into the available
        device memory (after reserving 200 MB).
    """
    AUK = ArrayUtilsKernel(queue=self.queue)
    self._dot_kernel = AUK.dot
    # get the scans
    for label, scan in self.ptycho.model.scans.items():

        kern = u.Param()
        kern.scanmodel = type(scan).__name__
        self.kernels[label] = kern

        # TODO: needs to be adapted for broad bandwidth
        geo = scan.geometries[0]

        # Get info to shape buffer arrays
        fpc = scan.max_frames_per_block

        # TODO : make this more foolproof
        try:
            nmodes = scan.p.coherence.num_probe_modes * \
                scan.p.coherence.num_object_modes
        except Exception:
            # best-effort fallback when the coherence parameters are
            # missing or unusable; unlike a bare ``except`` this does not
            # swallow KeyboardInterrupt / SystemExit
            nmodes = 1

        # create buffer arrays
        ash = (fpc * nmodes,) + tuple(int(s) for s in geo.shape)
        aux = cp.zeros(ash, dtype=np.complex64)
        kern.aux = aux
        kern.a = cp.zeros(ash, dtype=np.complex64)
        kern.b = cp.zeros(ash, dtype=np.complex64)

        # setup kernels, one for each SCAN.
        kern.GDK = GradientDescentKernel(
            aux, nmodes, queue=self.queue, math_type="double")
        kern.GDK.allocate()

        kern.POK = PoUpdateKernel(queue_thread=self.queue)
        kern.POK.allocate()

        kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue)
        kern.AWK.allocate()

        kern.TK = TransposeKernel(queue=self.queue)

        kern.PROP = PropagationKernel(
            aux, geo.propagator, queue_thread=self.queue, fft_type=self.p.fft_lib)
        kern.PROP.allocate()
        kern.resolution = geo.resolution[0]

        if self.do_position_refinement:
            kern.PCK = PositionCorrectionKernel(
                aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue)
            kern.PCK.allocate()

    # size of one block: half the bytes of the largest aux buffer, once
    # for the weights and once for the intensities
    mag_mem = 0
    for kern in self.kernels.values():
        mag_mem = max(kern.aux.nbytes // 2, mag_mem)
    ma_mem = mag_mem
    blk = ma_mem + mag_mem

    # We need to add the free memory from the pool to the free device memory,
    # as both will be used for allocations
    mempool = cp.get_default_memory_pool()
    mem = cp.cuda.runtime.memGetInfo()[0] + mempool.total_bytes() - mempool.used_bytes()

    # leave 200MB room for safety
    fit = int(mem - 200 * 1024 * 1024) // blk
    # ``fit`` is negative when less than the reserve is free, so test for
    # "no positive number of blocks" instead of just zero
    if fit <= 0:
        log(1, "Cannot fit memory into device, if possible reduce frames per block. Exiting...")
        raise SystemExit("ptypy has been exited.")

    # TODO grow blocks dynamically
    nma = min(fit, MAX_BLOCKS)
    log_device_memory_stats(4)
    log(4, 'CuPy max blocks fitting on GPU: ma_arrays={}'.format(nma))
    # reset memory or create new
    self.w_data = GpuDataManager(ma_mem, 0, nma, False)
    self.I_data = GpuDataManager(mag_mem, 0, nma, False)
+ if use_tiles: + prep.addr2_gpu = cp.asarray(prep.addr2) + + prep.I = cupyx.empty_pinned(d.data.shape, d.data.dtype, order="C") + prep.I[:] = d.data + + # Todo: avoid that extra copy of data + if self.do_position_refinement: + ma = self.ma.S[d.ID].data.astype(np.float32) + prep.ma = cupyx.empty_pinned(ma.shape, ma.dtype, order="C") + prep.ma[:] = ma + + log(4, 'Free memory on device: %.2f GB' % + (float(cp.cuda.runtime.memGetInfo()[0])/1e9)) + self.w_data.add_data_block() + self.I_data.add_data_block() + + self.dID_list = list(self.di.S.keys()) + + def _initialize_model(self): + + # Create noise model + if self.p.ML_type.lower() == "gaussian": + self.ML_model = GaussianModel(self) + elif self.p.ML_type.lower() == "poisson": + raise NotImplementedError('Poisson norm model not yet implemented') + elif self.p.ML_type.lower() == "euclid": + raise NotImplementedError('Euclid norm model not yet implemented') + else: + raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type) + + def _set_pr_ob_ref_for_data(self, dev='gpu', container=None, sync_copy=False): + """ + Overloading the context of Storage.data here, to allow for in-place math on Container instances: + """ + if container is not None: + if container.original == self.pr or container.original == self.ob: + for s in container.S.values(): + # convert data here + if dev == 'gpu': + s.data = s.gpu + if sync_copy: + s.gpu.set(s.cpu) + elif dev == 'cpu': + s.data = s.cpu + if sync_copy: + s.gpu.get(out=s.cpu) + #print('%s to cpu' % s.ID) + else: + for container in self.ptycho.containers.values(): + self._set_pr_ob_ref_for_data( + dev=dev, container=container, sync_copy=sync_copy) + + def _get_smooth_gradient(self, data, sigma): + if self.GSK.tmp is None: + self.GSK.tmp = cp.empty(data.shape, dtype=np.complex64) + self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp) + return data + + def _replace_ob_grad(self): + new_ob_grad = self.ob_grad_new + # Smoothing preconditioner + if self.smooth_gradient: + 
self.smooth_gradient.sigma *= (1. - self.p.smooth_gradient_decay) + for name, s in new_ob_grad.storages.items(): + s.gpu = self._get_smooth_gradient( + s.gpu, self.smooth_gradient.sigma) + + return self._replace_grad(self.ob_grad, new_ob_grad) + + def _replace_pr_grad(self): + new_pr_grad = self.pr_grad_new + # probe support + if self.p.probe_update_start <= self.curiter: + # Apply probe support if needed + for name, s in new_pr_grad.storages.items(): + self.support_constraint(s) + else: + new_pr_grad.fill(0.) + + return self._replace_grad(self.pr_grad, new_pr_grad) + + def _replace_grad(self, grad, new_grad): + norm = np.double(0.) + dot = np.double(0.) + for name, new in new_grad.storages.items(): + old = grad.storages[name] + norm += self._dot_kernel(new.gpu, new.gpu).get()[0] + dot += self._dot_kernel(new.gpu, old.gpu).get()[0] + old.gpu[:] = new.gpu + return norm, dot + + def engine_iterate(self, num=1): + err = super().engine_iterate(num) + # copy all data back to cpu + self._set_pr_ob_ref_for_data(dev='cpu', container=None, sync_copy=True) + return err + + def position_update(self): + """ + Position refinement + """ + if not self.do_position_refinement or (not self.curiter): + return + do_update_pos = (self.p.position_refinement.stop > + self.curiter >= self.p.position_refinement.start) + do_update_pos &= (self.curiter % + self.p.position_refinement.interval) == 0 + use_tiles = (not self.p.probe_update_cuda_atomics) or ( + not self.p.object_update_cuda_atomics) + + # Update positions + if do_update_pos: + """ + Iterates through all positions and refines them by a given algorithm. 
def support_constraint(self, storage=None):
    """
    Enforces 2D support constraint on probe.

    :param storage: probe storage to constrain. If ``None``, the constraint
        is applied to every storage in ``self.pr.storages``.

    For the given storage a Fourier-space support and a real-space support
    are looked up by storage ID; each is applied only if one is registered.
    The kernel objects are created on first use and cached in ``self.FSK``
    and ``self.RSK``.
    """
    if storage is None:
        for s in self.pr.storages.values():
            self.support_constraint(s)
        # all storages handled by the recursive calls; without this return
        # the code below would dereference ``None.ID``
        return

    # Fourier space
    support = self._probe_fourier_support.get(storage.ID)
    if support is not None:
        if storage.ID not in self.FSK:
            supp = support.astype(np.complex64)
            self.FSK[storage.ID] = FourierSupportKernel(
                supp, self.queue, self.p.fft_lib)
            self.FSK[storage.ID].allocate()
        self.FSK[storage.ID].apply_fourier_support(storage.gpu)

    # Real space
    support = self._probe_support.get(storage.ID)
    if support is not None:
        if storage.ID not in self.RSK:
            self.RSK[storage.ID] = RealSupportKernel(
                support.astype(np.complex64))
            self.RSK[storage.ID].allocate()
        self.RSK[storage.ID].apply_real_support(storage.gpu)
+ """ + super(GaussianModel, self).__init__(MLengine) + + if self.p.reg_del2: + self.regularizer = Regul_del2_cupy( + self.p.reg_del2_amplitude, + queue=self.engine.queue + ) + else: + self.regularizer = None + + def prepare(self): + + super(GaussianModel, self).prepare() + + for label, d in self.engine.ptycho.new_data: + prep = self.engine.diff_info[d.ID] + w = (self.Irenorm * self.engine.ma.S[d.ID].data + / (1. / self.Irenorm + d.data)).astype(d.data.dtype) + prep.weights = cupyx.empty_pinned(w.shape, w.dtype, order="C") + prep.weights[:] = w + + def __del__(self): + """ + Clean up routine + """ + super(GaussianModel, self).__del__() + + def new_grad(self): + """ + Compute a new gradient direction according to a Gaussian noise model. + + Note: The negative log-likelihood and local errors are also computed + here. + """ + ob_grad = self.engine.ob_grad_new + pr_grad = self.engine.pr_grad_new + qu_htod = self.engine.qu_htod + queue = self.engine.queue + + self.engine._set_pr_ob_ref_for_data('gpu') + ob_grad << 0. + pr_grad << 0. 
+ + # We need an array for MPI + LL = np.array([0.]) + error_dct = {} + + for dID in self.engine.dID_list: + prep = self.engine.diff_info[dID] + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.engine.kernels[prep.label] + GDK = kern.GDK + AWK = kern.AWK + POK = kern.POK + aux = kern.aux + + FW = kern.PROP.fw + BW = kern.PROP.bw + + # get addresses and auxilliary array + addr = prep.addr_gpu + fic = prep.fic_gpu + + err_phot = prep.err_phot_gpu + # local references + ob = self.engine.ob.S[oID].data + obg = ob_grad.S[oID].data + pr = self.engine.pr.S[pID].data + prg = pr_grad.S[pID].data + + # Schedule w & I to device + ev_w, w, data_w = self.engine.w_data.to_gpu( + prep.weights, dID, qu_htod) + ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod) + + # make propagated exit (to buffer) + AWK.build_aux_no_ex(aux, addr, ob, pr, add=False) + + # forward prop + FW(aux, aux) + GDK.make_model(aux, addr) + + queue.wait_event(ev) + + if self.p.floating_intensities: + GDK.floating_intensity(addr, w, I, fic) + + GDK.main(aux, addr, w, I) + data_w.record_done(queue, 'compute') + data_I.record_done(queue, 'compute') + + GDK.error_reduce(addr, err_phot) + + BW(aux, aux) + + use_atomics = self.p.object_update_cuda_atomics + addr = prep.addr_gpu if use_atomics else prep.addr2_gpu + POK.ob_update_ML(addr, obg, pr, aux, atomics=use_atomics) + + use_atomics = self.p.probe_update_cuda_atomics + addr = prep.addr_gpu if use_atomics else prep.addr2_gpu + POK.pr_update_ML(addr, prg, ob, aux, atomics=use_atomics) + + queue.synchronize() + self.engine.dID_list.reverse() + + # TODO we err_phot.sum, but not necessarily this error_dct until the end of contiguous iteration + for dID, prep in self.engine.diff_info.items(): + err_phot = prep.err_phot_gpu.get() + LL += err_phot.sum() + err_phot /= np.prod(prep.weights.shape[-2:]) + err_fourier = np.zeros_like(err_phot) + err_exit = np.zeros_like(err_phot) + 
errs = np.ascontiguousarray( + np.vstack([err_fourier, err_phot, err_exit]).T) + error_dct.update(zip(prep.view_IDs, errs)) + + # MPI reduction of gradients + + # DtoH copies + for s in ob_grad.S.values(): + s.gpu.get(out=s.cpu) + for s in pr_grad.S.values(): + s.gpu.get(out=s.cpu) + self.engine._set_pr_ob_ref_for_data('cpu') + + ob_grad.allreduce() + pr_grad.allreduce() + parallel.allreduce(LL) + + # HtoD cause we continue on gpu + for s in ob_grad.S.values(): + s.gpu.set(s.cpu) + for s in pr_grad.S.values(): + s.gpu.set(s.cpu) + self.engine._set_pr_ob_ref_for_data('gpu') + + # Object regularizer + if self.regularizer: + for name, s in self.engine.ob.storages.items(): + ob_grad.storages[name].data += self.regularizer.grad(s.data) + LL += self.regularizer.LL + + self.LL = LL / self.tot_measpts + + return error_dct + + def poly_line_coeffs(self, c_ob_h, c_pr_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h + """ + self.engine._set_pr_ob_ref_for_data('gpu') + qu_htod = self.engine.qu_htod + queue = self.engine.queue + + # does not accept np.longdouble + B = cp.zeros((3,), dtype=np.float32) + Brenorm = 1. 
/ self.LL[0] ** 2 + + # Outer loop: through diffraction patterns + for dID in self.engine.dID_list: + prep = self.engine.diff_info[dID] + + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.engine.kernels[prep.label] + GDK = kern.GDK + AWK = kern.AWK + + f = kern.aux + a = kern.a + b = kern.b + + FW = kern.PROP.fw + + # get addresses and auxiliary arrays + addr = prep.addr_gpu + fic = prep.fic_gpu + + # Schedule w & I to device + ev_w, w, data_w = self.engine.w_data.to_gpu( + prep.weights, dID, qu_htod) + ev, I, data_I = self.engine.I_data.to_gpu(prep.I, dID, qu_htod) + + # local references + ob = self.ob.S[oID].data + ob_h = c_ob_h.S[oID].data + pr = self.pr.S[pID].data + pr_h = c_pr_h.S[pID].data + + # make propagated exit (to buffer) + AWK.build_aux_no_ex(f, addr, ob, pr, add=False) + AWK.build_aux_no_ex(a, addr, ob_h, pr, add=False) + AWK.build_aux_no_ex(a, addr, ob, pr_h, add=True) + AWK.build_aux_no_ex(b, addr, ob_h, pr_h, add=False) + + # forward prop + FW(f, f) + FW(a, a) + FW(b, b) + + queue.wait_event(ev) + + GDK.make_a012(f, a, b, addr, I, fic) + GDK.fill_b(addr, Brenorm, w, B) + + data_w.record_done(queue, 'compute') + data_I.record_done(queue, 'compute') + + queue.synchronize() + self.engine.dID_list.reverse() + + B = B.get() + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B += Brenorm * self.regularizer.poly_line_coeffs( + c_ob_h.storages[name].data, s.data) + + self.B = B + + return B + + +class Regul_del2_cupy(object): + """\ + Squared gradient regularizer (Gaussian prior). + + This class applies to any numpy array. 
def __init__(self, amplitude, axes=None, queue=None):
    """
    Set up the derivative, norm and dot helpers used by the regularizer.

    :param amplitude: regularizer amplitude, stored as ``self.amplitude``.
    :param axes: the two axes along which differences are taken. Defaults
        to ``[-2, -1]``. A fresh list is created per instance so that
        instances never share one mutable default.
    :param queue: optional stream handed to all kernel helpers; if given,
        ``queue.use()`` is called before the gradient kernel is launched.
    """
    # Regul.__init__(self, axes)
    self.axes = [-2, -1] if axes is None else axes
    self.amplitude = amplitude
    self.delxy = None
    self.g = None
    self.LL = None
    self.queue = queue
    self.AUK = ArrayUtilsKernel(queue=queue)
    self.DELK_c = DerivativesKernel(np.complex64, queue=queue)
    self.DELK_f = DerivativesKernel(np.float32, queue=queue)

    def empty(x):
        return cp.empty(x.shape, x.dtype)

    def delxb(x, axis=-1):
        # dispatch to the kernel compiled for the dtype of x
        out = empty(x)
        if x.dtype == np.float32:
            self.DELK_f.delxb(x, out, axis)
        elif x.dtype == np.complex64:
            self.DELK_c.delxb(x, out, axis)
        else:
            raise TypeError("Type %s invalid for derivatives" % x.dtype)
        return out

    self.delxb = delxb

    def delxf(x, axis=-1):
        # dispatch to the kernel compiled for the dtype of x
        out = empty(x)
        if x.dtype == np.float32:
            self.DELK_f.delxf(x, out, axis)
        elif x.dtype == np.complex64:
            self.DELK_c.delxf(x, out, axis)
        else:
            raise TypeError("Type %s invalid for derivatives" % x.dtype)
        return out

    self.delxf = delxf
    self.norm = lambda x: self.AUK.norm2(x).get().item()
    self.dot = lambda x, y: self.AUK.dot(x, y).get().item()

    self._grad_reg_kernel = cp.ElementwiseKernel(
        "float32 fac, complex64 py, complex64 px, complex64 my, complex64 mx",
        "complex64 out",
        "out = (px+py-my-mx) * fac",
        "grad_reg",
        no_return=True
    )

    # named differently from the ``grad`` method to avoid confusion; it is
    # only reachable through ``self.reg_grad``
    def reg_grad(amp, px, py, mx, my):
        out = empty(px)
        if self.queue is not None:
            self.queue.use()
        # argument order kept exactly as before; the kernel expression is
        # symmetric in (px, py) and in (mx, my)
        self._grad_reg_kernel(amp, py, px, mx, my, out)
        return out
    self.reg_grad = reg_grad
def poly_line_coeffs(self, h, x=None):
    """
    Return the three polynomial coefficients ``[c0, c1, c2]`` of the
    regularizer along direction ``h``.

    If ``x`` is ``None`` the four derivative arrays cached in
    ``self.delxy`` are reused; otherwise forward and backward differences
    of ``x`` are computed along both axes. The result is also stored as
    ``self.coeff``.
    """
    first_axis, second_axis = self.axes
    if x is not None:
        x_xf = self.delxf(x, axis=first_axis)
        x_yf = self.delxf(x, axis=second_axis)
        x_xb = self.delxb(x, axis=first_axis)
        x_yb = self.delxb(x, axis=second_axis)
    else:
        x_xf, x_yf, x_xb, x_yb = self.delxy

    h_xf = self.delxf(h, axis=first_axis)
    h_yf = self.delxf(h, axis=second_axis)
    h_xb = self.delxb(h, axis=first_axis)
    h_yb = self.delxb(h, axis=second_axis)

    amp = self.amplitude
    norm = self.norm
    dot = self.dot

    # constant term: squared norms of the derivatives of x
    x_norms = norm(x_xf) + norm(x_yf) + norm(x_xb) + norm(x_yb)
    constant = amp * x_norms

    # linear term: cross products between derivatives of x and of h
    cross = dot(x_xf, h_xf) + dot(x_yf, h_yf) + dot(x_xb, h_xb) + dot(x_yb, h_yb)
    linear = 2 * amp * cross

    # quadratic term: squared norms of the derivatives of h
    h_norms = norm(h_xf) + norm(h_yf) + norm(h_xb) + norm(h_yb)
    quadratic = amp * h_norms

    self.coeff = np.array([constant, linear, quadratic])
    return self.coeff
+""" + +import numpy as np +import time +import cupy as cp + +from ptypy import utils as u +from ptypy.accelerate.cuda_cupy import get_context, log_device_memory_stats +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from ptypy.engines import register +from ptypy.engines.projectional import DMMixin, RAARMixin +from ptypy.accelerate.base.engines import projectional_serial +from ..kernels import FourierUpdateKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel +from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel +from ..array_utils import ArrayUtilsKernel, GaussianSmoothingKernel,\ + TransposeKernel, ClipMagnitudesKernel, MassCenterKernel, Abs2SumKernel,\ + InterpolatedShiftKernel +from ..mem_utils import make_pagelocked_paired_arrays as mppa +from ..multi_gpu import get_multi_gpu_communicator + +__all__ = ['DM_cupy', 'RAAR_cupy'] + + +class _ProjectionEngine_cupy(projectional_serial._ProjectionEngine_serial): + + """ + Defaults: + + [probe_update_cuda_atomics] + default = False + type = bool + help = For GPU, use the atomics version for probe update kernel + + [object_update_cuda_atomics] + default = True + type = bool + help = For GPU, use the atomics version for object update kernel + + [fft_lib] + default = cuda + type = str + help = Choose the pycuda-compatible FFT module. + doc = One of: + - ``'cuda'`` : ptypy's cuda wrapper (delayed load, but fastest compute if all data is on GPU) + - ``'cupy'`` : cupy's cuFFT wrapper (fast load, slowest compute due to additional store/load stages) + choices = 'cuda','cupy' + userlevel = 2 + + """ + + def __init__(self, ptycho_parent, pars=None): + """ + Difference map reconstruction engine. + """ + super().__init__(ptycho_parent, pars) + self.multigpu = None + + def engine_initialize(self): + """ + Prepare for reconstruction. 
+ """ + # Context, Multi GPU communicator and Stream (needs to be in this order) + self.queue = get_context(new_queue=False) + self.multigpu = get_multi_gpu_communicator() + + # Gaussian Smoothing Kernel + self.GSK = GaussianSmoothingKernel(queue=self.queue) + + # Real/Fourier Support Kernel + self.RSK = {} + self.FSK = {} + + # Clip Magnitudes Kernel + self.CMK = ClipMagnitudesKernel(queue=self.queue) + + # initialise kernels for centring probe if required + if self.p.probe_center_tol is not None: + # mass center kernel + self.MCK = MassCenterKernel(queue=self.queue) + # absolute sum kernel + self.A2SK = Abs2SumKernel(dtype=self.pr.dtype, queue=self.queue) + # interpolated shift kernel + self.ISK = InterpolatedShiftKernel(queue=self.queue) + + super().engine_initialize() + + def _setup_kernels(self): + """ + Setup kernels, one for each scan. Derive scans from ptycho class + """ + # get the scans + for label, scan in self.ptycho.model.scans.items(): + + kern = u.Param() + kern.scanmodel = type(scan).__name__ + self.kernels[label] = kern + # TODO: needs to be adapted for broad bandwidth + geo = scan.geometries[0] + + # Get info to shape buffer arrays + fpc = scan.max_frames_per_block + + # TODO : make this more foolproof + try: + nmodes = scan.p.coherence.num_probe_modes * \ + scan.p.coherence.num_object_modes + except: + nmodes = 1 + + # create buffer arrays + ash = (fpc * nmodes,) + tuple(geo.shape) + aux = np.zeros(ash, dtype=np.complex64) + kern.aux = cp.asarray(aux) + + # setup kernels, one for each SCAN. 
+ log(4, "Setting up FourierUpdateKernel") + kern.FUK = FourierUpdateKernel( + aux, nmodes, queue_thread=self.queue) + kern.FUK.allocate() + + log(4, "Setting up PoUpdateKernel") + kern.POK = PoUpdateKernel(queue_thread=self.queue) + kern.POK.allocate() + + log(4, "Setting up AuxiliaryWaveKernel") + kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue) + kern.AWK.allocate() + + log(4, "Setting up ArrayUtilsKernel") + kern.AUK = ArrayUtilsKernel(queue=self.queue) + + log(4, "Setting up TransposeKernel") + kern.TK = TransposeKernel(queue=self.queue) + + log(4, "Setting up PropagationKernel") + kern.PROP = PropagationKernel( + aux, geo.propagator, self.queue, self.p.fft_lib) + kern.PROP.allocate() + kern.resolution = geo.resolution[0] + + if self.do_position_refinement: + log(4, "Setting up PositionCorrectionKernel") + kern.PCK = PositionCorrectionKernel( + aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue) + kern.PCK.allocate() + log(4, "Kernel setup completed") + + def engine_prepare(self): + + super().engine_prepare() + + for name, s in self.ob.S.items(): + s.gpu = cp.asarray(s.data) # TODO: investigate if this can be pinned, it's much faster + for name, s in self.ob_buf.S.items(): + s.gpu, s.data = mppa(s.data) + for name, s in self.ob_nrm.S.items(): + s.gpu, s.data = mppa(s.data) + for name, s in self.pr.S.items(): + s.gpu, s.data = mppa(s.data) + for name, s in self.pr_buf.S.items(): + s.gpu, s.data = mppa(s.data) + for name, s in self.pr_nrm.S.items(): + s.gpu, s.data = mppa(s.data) + + use_tiles = (not self.p.probe_update_cuda_atomics) or ( + not self.p.object_update_cuda_atomics) + + # TODO : like the serialization this one is needed due to object reformatting + for label, d in self.di.storages.items(): + prep = self.diff_info[d.ID] + prep.addr_gpu = cp.asarray(prep.addr) + if use_tiles: + prep.addr2 = np.ascontiguousarray( + np.transpose(prep.addr, (2, 3, 0, 1))) + prep.addr2_gpu = cp.asarray(prep.addr2) + if 
self.do_position_refinement: + prep.mangled_addr_gpu = prep.addr_gpu.copy() + + for label, d in self.ptycho.new_data: + prep = self.diff_info[d.ID] + pID, oID, eID = prep.poe_IDs + s = self.ex.S[eID] + s.gpu = cp.asarray(s.data) + s = self.ma.S[d.ID] + s.gpu = cp.asarray(s.data.astype(np.float32)) + + prep.mag = cp.asarray(prep.mag) + prep.ma_sum = cp.asarray(prep.ma_sum) + prep.err_fourier_gpu = cp.asarray(prep.err_fourier) + prep.err_phot_gpu = cp.asarray(prep.err_phot) + prep.err_exit_gpu = cp.asarray(prep.err_exit) + if self.do_position_refinement: + prep.error_state_gpu = cp.empty_like(prep.err_fourier_gpu) + + def engine_iterate(self, num=1): + """ + Compute one iteration. + """ + queue = self.queue + queue.use() + + for it in range(num): + error = {} + for dID in self.di.S.keys(): + + # find probe, object and exit ID in dependence of dID + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.kernels[prep.label] + FUK = kern.FUK + AWK = kern.AWK + PROP = kern.PROP + + # get addresses and buffers + addr = prep.addr_gpu + mag = prep.mag + ma_sum = prep.ma_sum + err_fourier = prep.err_fourier_gpu + err_phot = prep.err_phot_gpu + err_exit = prep.err_exit_gpu + pbound = self.pbound_scan[prep.label] + aux = kern.aux + + # local references + ma = self.ma.S[dID].gpu + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + ex = self.ex.S[eID].gpu + + # compute log-likelihood + if self.p.compute_log_likelihood: + AWK.build_aux_no_ex(aux, addr, ob, pr) + PROP.fw(aux, aux) + FUK.log_likelihood(aux, addr, mag, ma, err_phot) + + # build auxilliary wave + #AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha) + AWK.make_aux(aux, addr, ob, pr, ex, + c_po=self._c, c_e=1-self._c) + + # forward FFT + PROP.fw(aux, aux) + + # Deviation from measured data + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + FUK.fmag_all_update(aux, addr, mag, ma, err_fourier, pbound) + + # backward FFT + 
PROP.bw(aux, aux) + + # build exit wave + #AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) + AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b, + c_po=self._a, c_e=-(self._a + self._b)) + FUK.exit_error(aux, addr) + FUK.error_reduce(addr, err_exit) + + parallel.barrier() + + sync = (self.curiter % 1 == 0) + self.overlap_update() + + self.center_probe() + + parallel.barrier() + self.position_update() + + self.curiter += 1 + queue.synchronize() + + for name, s in self.ob.S.items(): + cp.asnumpy(s.gpu, stream=self.queue, out=s.data) + for name, s in self.pr.S.items(): + cp.asnumpy(s.gpu, stream=self.queue, out=s.data) + + queue.synchronize() + + # costly but needed to sync back with + # for name, s in self.ex.S.items(): + # s.data[:] = s.gpu.get() + for dID, prep in self.diff_info.items(): + err_fourier = prep.err_fourier_gpu.get() + err_phot = prep.err_phot_gpu.get() + err_exit = prep.err_exit_gpu.get() + errs = np.ascontiguousarray( + np.vstack([err_fourier, err_phot, err_exit]).T) + error.update(zip(prep.view_IDs, errs)) + + self.error = error + return error + + def position_update(self): + """ + Position refinement + """ + if not self.do_position_refinement or (not self.curiter): + return + do_update_pos = (self.p.position_refinement.stop > + self.curiter >= self.p.position_refinement.start) + do_update_pos &= (self.curiter % + self.p.position_refinement.interval) == 0 + use_tiles = (not self.p.probe_update_cuda_atomics) or ( + not self.p.object_update_cuda_atomics) + + # Update positions + if do_update_pos: + self.queue.use() + """ + Iterates through all positions and refines them by a given algorithm. 
+ """ + log(4, "----------- START POS REF -------------") + for dID in self.di.S.keys(): + + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + ma = self.ma.S[dID].gpu + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + kern = self.kernels[prep.label] + aux = kern.aux + addr = prep.addr_gpu + original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu + mag = prep.mag + ma_sum = prep.ma_sum + err_fourier = prep.err_fourier_gpu + error_state = prep.error_state_gpu + + PCK = kern.PCK + TK = kern.TK + PROP = kern.PROP + + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) + cp.cuda.runtime.memcpyAsync(dst=error_state.data.ptr, + src=err_fourier.data.ptr, + size=err_fourier.nbytes, + kind=3, # device to device + stream=self.queue.ptr) + + PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0]) + + log(4, 'Position refinement trial: iteration %s' % (self.curiter)) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address( + i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) + PROP.fw(aux, aux) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood( + aux, mangled_addr, mag, ma, err_fourier) + PCK.update_addr_and_error_state( + addr, error_state, mangled_addr, err_fourier) + + cp.cuda.runtime.memcpyAsync(dst=err_fourier.data.ptr, + src=error_state.data.ptr, + size=err_fourier.nbytes, + kind=3, + stream=self.queue.ptr) 
def center_probe(self):
    """
    Re-centre each probe storage on the GPU when centring is enabled.

    Does nothing when ``self.p.probe_center_tol`` is None. Otherwise the
    mass centre of the summed probe intensity is compared with the array
    centre; if the offset reaches the tolerance, the probe, the object
    storage of the probe's first view and every exit wave belonging to
    that probe are shifted by the same amount.
    """
    if self.p.probe_center_tol is None:
        return

    self.queue.use()
    for name, probe in self.pr.storages.items():
        intensity = self.A2SK.abs2sum(probe.gpu)
        c1 = self.MCK.mass_center(intensity).get()
        c2 = (np.asarray(probe.shape[-2:]) // 2).astype(c1.dtype)
        shift = c2 - c1

        # exit if the current center of mass is within the tolerance
        # NOTE(review): this leaves the loop, so any remaining probe
        # storages are not examined -- confirm that `continue` is not
        # what was intended.
        if u.norm(shift) < self.p.probe_center_tol:
            break

        # shift the probe
        probe.gpu = self.ISK.interpolate_shift(probe.gpu, shift)

        # shift the object
        obj = probe.views[0].pod.ob_view.storage
        obj.gpu = self.ISK.interpolate_shift(obj.gpu, shift)

        # shift the exit waves that belong to this probe
        for dID in self.di.S.keys():
            pID, oID, eID = self.diff_info[dID].poe_IDs
            if pID != name:
                continue
            self.ex.S[eID].gpu = self.ISK.interpolate_shift(
                self.ex.S[eID].gpu, shift)

        log(4, 'Probe recentered from %s to %s'
            % (str(tuple(c1)), str(tuple(c2))))
def support_constraint(self, storage=None):
    """
    Enforce the 2D support constraint on the probe.

    :param storage: Probe storage to constrain. When None, the constraint
        is applied to every storage in ``self.pr.storages`` in turn.

    For a given storage, the Fourier-space support (looked up by
    ``storage.ID`` in ``self._probe_fourier_support``) is applied first,
    then the real-space support from ``self._probe_support``. The kernels
    are created on first use and cached in ``self.FSK`` / ``self.RSK``.
    """
    if storage is None:
        for s in self.pr.storages.values():
            self.support_constraint(s)
        # All storages were handled by the recursive calls. Without this
        # return the code below would dereference ``storage.ID`` on None.
        return

    # Fourier space
    support = self._probe_fourier_support.get(storage.ID)
    if support is not None:
        if storage.ID not in self.FSK:
            supp = support.astype(np.complex64)
            self.FSK[storage.ID] = FourierSupportKernel(
                supp, self.queue, self.p.fft_lib)
            self.FSK[storage.ID].allocate()
        self.FSK[storage.ID].apply_fourier_support(storage.gpu)

    # Real space
    support = self._probe_support.get(storage.ID)
    if support is not None:
        if storage.ID not in self.RSK:
            self.RSK[storage.ID] = RealSupportKernel(
                support.astype(np.complex64))
            self.RSK[storage.ID].allocate()
        self.RSK[storage.ID].apply_real_support(storage.gpu)
@register(name="RAAR_cupy_nostream")
class RAAR_cupy(_ProjectionEngine_cupy, RAARMixin):
    """
    A RAAR engine accelerated with cupy.

    Defaults:

    [name]
    default = RAAR_cupy
    type = str
    help =
    doc =

    """

    def __init__(self, ptycho_parent, pars=None):
        """
        Initialise the cupy projection engine, then the RAAR mixin.

        The mixin is initialised second because it is given ``self.p.beta``,
        which is read after the engine initialiser has run.
        """
        _ProjectionEngine_cupy.__init__(self, ptycho_parent, pars)
        RAARMixin.__init__(self, self.p.beta)
def _setup_kernels(self):
    """
    Set up the per-scan kernels and size the GPU block managers.

    After the parent class has built the kernels, the largest exit-wave
    buffer (``kern.aux``) and the largest magnitude buffer
    (``kern.FUK.gpu.fdev``) determine the size of one block. The number
    of blocks is derived from the free device memory plus the unused part
    of the cupy memory pool, minus a 200 MB safety margin.

    :raises SystemExit: if not even a single block fits on the device.
    """
    super()._setup_kernels()
    ex_mem = 0
    mag_mem = 0
    for kern in self.kernels.values():
        ex_mem = max(kern.aux.nbytes, ex_mem)
        mag_mem = max(kern.FUK.gpu.fdev.nbytes, mag_mem)
    ma_mem = mag_mem

    blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem

    # We need to add the free memory from the pool to the free device memory,
    # as both will be used for allocations
    mempool = cp.get_default_memory_pool()
    mem = cp.cuda.runtime.memGetInfo()[0] + \
        mempool.total_bytes() - mempool.used_bytes()

    # leave 200MB room for safety
    fit = int(mem - 200 * 1024 * 1024) // blk
    # ``fit`` is negative when less than the safety margin is free; floor
    # division then yields -1, which is truthy, so test the sign explicitly.
    if fit <= 0:
        log(1, "Cannot fit memory into device, if possible reduce frames per block. Exiting...")
        raise SystemExit("ptypy has been exited.")

    # TODO grow blocks dynamically
    nex = min(fit * EX_MA_BLOCKS_RATIO, MAX_BLOCKS)
    nma = min(fit, MAX_BLOCKS)
    log_device_memory_stats(4)
    log(4, 'cupy max blocks fitting on GPU: exit arrays={}, ma_arrays={}'.format(
        nex, nma))
    # reset memory or create new
    self.ex_data = GpuDataManager(ex_mem, 0, nex, True)
    self.ma_data = GpuDataManager(ma_mem, 0, nma, False)
    self.mag_data = GpuDataManager(mag_mem, 0, nma, False)
prep.err_exit_gpu = cp.asarray(prep.err_exit) + if self.do_position_refinement: + prep.error_state_gpu = cp.empty_like(prep.err_fourier_gpu) + ma = self.ma.S[dID].data.astype(np.float32) + prep.ma = cupyx.empty_pinned(ma.shape, ma.dtype, order="C") + prep.ma[:] = ma + ex = self.ex.S[eID].data + prep.ex = cupyx.empty_pinned(ex.shape, ex.dtype, order="C") + prep.ex[:] = ex + mag = prep.mag + prep.mag = cupyx.empty_pinned(mag.shape, mag.dtype, order="C") + prep.mag[:] = mag + + log(4, 'Free memory on device: %.2f GB' % + (float(cp.cuda.runtime.memGetInfo()[0])/1e9)) + self.ex_data.add_data_block() + self.ma_data.add_data_block() + self.mag_data.add_data_block() + + def engine_iterate(self, num=1): + """ + Compute one iteration. + """ + # ma_buf = ma_c = np.zeros(FUK.fshape, dtype=np.float32) + self.dID_list = list(self.di.S.keys()) + atomics_probe = self.p.probe_update_cuda_atomics + atomics_object = self.p.object_update_cuda_atomics + use_tiles = (not atomics_object) or (not atomics_probe) + + for it in range(num): + + error = {} + + for inner in range(self.p.overlap_max_iterations): + + change = 0 + + do_update_probe = (self.curiter >= self.p.probe_update_start) + do_update_object = (self.p.update_object_first or ( + inner > 0) or not do_update_probe) + do_update_fourier = (inner == 0) + + # initialize probe and object buffer to receive an update + if do_update_object: + for oID, ob in self.ob.storages.items(): + cfact = self.ob_cfact[oID] + obn = self.ob_nrm.S[oID] + obb = self.ob_buf.S[oID] + + if self.p.obj_smooth_std is not None: + log(4, 'Smoothing object, cfact is %.2f' % cfact) + smooth_mfs = [self.p.obj_smooth_std, + self.p.obj_smooth_std] + # We need a third copy, because we still need ob.gpu for the fourier update + obb.gpu[:] = ob.gpu[:] + self.GSK.convolution( + obb.gpu, smooth_mfs, tmp=obb.tmp) + obb.gpu *= np.complex64(cfact) + else: + # obb.gpu[:] = ob.gpu * np.complex64(cfact) + cp.multiply(ob.gpu, np.complex64(cfact), out=obb.gpu) + 
obn.gpu.fill(np.float32(cfact)) + + # First cycle: Fourier + object update + for iblock, dID in enumerate(self.dID_list): + prep = self.diff_info[dID] + + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + # references for kernels + kern = self.kernels[prep.label] + FUK = kern.FUK + AWK = kern.AWK + POK = kern.POK + + pbound = self.pbound_scan[prep.label] + aux = kern.aux + PROP = kern.PROP + + # get addresses and auxilliary array + addr = prep.addr_gpu + addr2 = prep.addr2_gpu if use_tiles else None + err_fourier = prep.err_fourier_gpu + err_phot = prep.err_phot_gpu + err_exit = prep.err_exit_gpu + ma_sum = prep.ma_sum_gpu + + # local references + ob = self.ob.S[oID].gpu + obn = self.ob_nrm.S[oID].gpu + obb = self.ob_buf.S[oID].gpu + pr = self.pr.S[pID].gpu + + # Schedule ex to device + ev_ex, ex, data_ex = self.ex_data.to_gpu( + prep.ex, dID, self.qu_htod) + + # Fourier update. + if do_update_fourier: + self.ex_data.syncback = True + log(4, '----- Fourier update -----', True) + + # Schedule ma & mag to device + ev_ma, ma, data_ma = self.ma_data.to_gpu( + prep.ma, dID, self.qu_htod) + ev_mag, mag, data_mag = self.mag_data.to_gpu( + prep.mag, dID, self.qu_htod) + + # compute log-likelihood + if self.p.compute_log_likelihood: + AWK.build_aux_no_ex(aux, addr, ob, pr) + PROP.fw(aux, aux) + # synchronize h2d stream with compute stream + self.queue.wait_event(ev_mag) + FUK.log_likelihood(aux, addr, mag, ma, err_phot) + + # synchronize h2d stream with compute stream + self.queue.wait_event(ev_ex) + #AWK.build_aux(aux, addr, ob, pr, ex, alpha=self.p.alpha) + AWK.make_aux(aux, addr, ob, pr, ex, + c_po=self._c, c_e=1-self._c) + + # FFT + PROP.fw(aux, aux) + + # Deviation from measured data + # synchronize h2d stream with compute stream + self.queue.wait_event(ev_mag) + FUK.fourier_error(aux, addr, mag, ma, ma_sum) + FUK.error_reduce(addr, err_fourier) + FUK.fmag_all_update( + aux, addr, mag, ma, err_fourier, pbound) + + 
data_mag.record_done(self.queue, 'compute') + data_ma.record_done(self.queue, 'compute') + + PROP.bw(aux, aux) + # apply changes + #AWK.build_exit(aux, addr, ob, pr, ex, alpha=self.p.alpha) + AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b, + c_po=self._a, c_e=-(self._a + self._b)) + FUK.exit_error(aux, addr) + FUK.error_reduce(addr, err_exit) + + prestr = '%d Iteration (Overlap) #%02d: ' % ( + parallel.rank, inner) + + # Update object + if do_update_object: + log(4, prestr + '----- object update -----', True) + addrt = addr if atomics_object else addr2 + self.queue.wait_event(ev_ex) + POK.ob_update(addrt, obb, obn, pr, ex, + atomics=atomics_object) + + data_ex.record_done(self.queue, 'compute') + if iblock + len(self.ex_data) < len(self.dID_list): + data_ex.from_gpu(self.qu_dtoh) + + # swap direction + if do_update_fourier or do_update_object: + self.dID_list.reverse() + + # wait for compute stream to finish + self.queue.synchronize() + + if do_update_object: + + for oID, ob in self.ob.storages.items(): + obn = self.ob_nrm.S[oID] + obb = self.ob_buf.S[oID] + self.multigpu.allReduceSum(obb.gpu) + self.multigpu.allReduceSum(obn.gpu) + obb.gpu /= obn.gpu + + self.clip_object(obb.gpu) + ob.gpu[:] = obb.gpu + + # Exit if probe should not yet be updated + if not do_update_probe: + break + self.ex_data.syncback = False + + # Update probe + log(4, prestr + '----- probe update -----', True) + change = self.probe_update() + log(4, prestr + 'change in probe is %.3f' % change, True) + + # stop iteration if probe change is small + if change < self.p.overlap_converge_factor: + break + + self.queue.synchronize() + parallel.barrier() + + if self.do_position_refinement and (self.curiter): + do_update_pos = (self.p.position_refinement.stop > + self.curiter >= self.p.position_refinement.start) + do_update_pos &= (self.curiter % + self.p.position_refinement.interval) == 0 + + # Update positions + if do_update_pos: + """ + Iterates through all positions and refines them by a given 
algorithm. + """ + log(4, "----------- START POS REF -------------") + for dID in self.di.S.keys(): + + prep = self.diff_info[dID] + pID, oID, eID = prep.poe_IDs + ob = self.ob.S[oID].gpu + pr = self.pr.S[pID].gpu + kern = self.kernels[prep.label] + aux = kern.aux + addr = prep.addr_gpu + original_addr = prep.original_addr + mangled_addr = prep.mangled_addr_gpu + ma_sum = prep.ma_sum_gpu + err_fourier = prep.err_fourier_gpu + error_state = prep.error_state_gpu + + PCK = kern.PCK + TK = kern.TK + PROP = kern.PROP + + # Make sure our data arrays are on device + ev_ma, ma, data_ma = self.ma_data.to_gpu( + prep.ma, dID, self.qu_htod) + ev_mag, mag, data_mag = self.mag_data.to_gpu( + prep.mag, dID, self.qu_htod) + + # Keep track of object boundaries + max_oby = ob.shape[-2] - aux.shape[-2] - 1 + max_obx = ob.shape[-1] - aux.shape[-1] - 1 + + # We need to re-calculate the current error + PCK.build_aux(aux, addr, ob, pr) + PROP.fw(aux, aux) + # wait for data to arrive + self.queue.wait_event(ev_mag) + + # We need to re-calculate the current error + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error(aux, addr, mag, ma, ma_sum) + PCK.error_reduce(addr, err_fourier) + if self.p.position_refinement.metric == "photon": + PCK.log_likelihood(aux, addr, mag, ma, err_fourier) + cp.cuda.runtime.memcpyAsync(dst=error_state.data.ptr, + src=err_fourier.data.ptr, + size=err_fourier.nbytes, + kind=3, # device to device + stream=self.queue.ptr) + + log(4, 'Position refinement trial: iteration %s' % + (self.curiter)) + PCK.mangler.setup_shifts( + self.curiter, nframes=addr.shape[0]) + for i in range(PCK.mangler.nshifts): + PCK.mangler.get_address( + i, addr, mangled_addr, max_oby, max_obx) + PCK.build_aux(aux, mangled_addr, ob, pr) + PROP.fw(aux, aux) + if self.p.position_refinement.metric == "fourier": + PCK.fourier_error( + aux, mangled_addr, mag, ma, ma_sum) + PCK.error_reduce(mangled_addr, err_fourier) + if self.p.position_refinement.metric == "photon": + 
PCK.log_likelihood( + aux, mangled_addr, mag, ma, err_fourier) + PCK.update_addr_and_error_state( + addr, error_state, mangled_addr, err_fourier) + + data_mag.record_done(self.queue, 'compute') + data_ma.record_done(self.queue, 'compute') + cp.cuda.runtime.memcpyAsync(dst=err_fourier.data.ptr, + src=error_state.data.ptr, + size=err_fourier.nbytes, + kind=3, # d2d + stream=self.queue.ptr) + if use_tiles: + s1 = prep.addr_gpu.shape[0] * \ + prep.addr_gpu.shape[1] + s2 = prep.addr_gpu.shape[2] * \ + prep.addr_gpu.shape[3] + TK.transpose(prep.addr_gpu.reshape( + s1, s2), prep.addr2_gpu.reshape(s2, s1)) + + self.curiter += 1 + self.queue.synchronize() + + for name, s in self.ob.S.items(): + cp.asnumpy(s.gpu, stream=self.queue, out=s.data) + for name, s in self.pr.S.items(): + cp.asnumpy(s.gpu, stream=self.queue, out=s.data) + + self.queue.synchronize() + + # costly but needed to sync back with + # for name, s in self.ex.S.items(): + # s.data[:] = s.gpu.get() + for dID, prep in self.diff_info.items(): + err_fourier = prep.err_fourier_gpu.get() + err_phot = prep.err_phot_gpu.get() + err_exit = prep.err_exit_gpu.get() + errs = np.ascontiguousarray( + np.vstack([err_fourier, err_phot, err_exit]).T) + error.update(zip(prep.view_IDs, errs)) + + self.error = error + return error + + # probe update + def probe_update(self, MPI=False): + queue = self.queue + use_atomics = self.p.probe_update_cuda_atomics + # storage for-loop + change_gpu = cp.zeros((1,), dtype=np.float32) + for pID, pr in self.pr.storages.items(): + prn = self.pr_nrm.S[pID] + cfact = self.pr_cfact[pID] + with queue: + pr.gpu *= np.float32(cfact) + prn.gpu.fill(np.float32(cfact)) + + for iblock, dID in enumerate(self.dID_list): + prep = self.diff_info[dID] + + POK = self.kernels[prep.label].POK + # find probe, object in exit ID in dependence of dID + pID, oID, eID = prep.poe_IDs + + ev, ex, data_ex = self.ex_data.to_gpu(prep.ex, dID, self.qu_htod) + self.queue.wait_event(ev) + + addrt = prep.addr_gpu if 
use_atomics else prep.addr2_gpu + ev = POK.pr_update(addrt, + self.pr.S[pID].gpu, + self.pr_nrm.S[pID].gpu, + self.ob.S[oID].gpu, + ex, + atomics=use_atomics) + + data_ex.record_done(self.queue, 'compute') + if iblock + len(self.ex_data) < len(self.dID_list): + data_ex.from_gpu(self.qu_dtoh) + + self.dID_list.reverse() + + self.queue.synchronize() + self.queue.use() + for pID, pr in self.pr.storages.items(): + + buf = self.pr_buf.S[pID] + prn = self.pr_nrm.S[pID] + + self.multigpu.allReduceSum(pr.gpu) + self.multigpu.allReduceSum(prn.gpu) + pr.gpu /= prn.gpu + self.support_constraint(pr) + + # calculate change on GPU + AUK = self.kernels[list(self.kernels)[0]].AUK + buf.gpu -= pr.gpu + change_gpu += (AUK.norm2(buf.gpu) / AUK.norm2(pr.gpu)) + buf.gpu[:] = pr.gpu + self.multigpu.allReduceSum(change_gpu) + change = change_gpu.get().item() / parallel.size + + return np.sqrt(change) + + def engine_finalize(self): + """ + Clear all GPU data, pinned memory, etc + """ + self.ex_data = None + self.ma_data = None + self.mag_data = None + + super().engine_finalize() + + log_device_memory_stats(4) + +@register(name="DM_cupy") +class DM_cupy_stream(_ProjectionEngine_cupy_stream, DMMixin): + """ + A full-fledged Difference Map engine accelerated with cupy. + + Defaults: + + [name] + default = DM_cupy + type = str + help = + doc = + + """ + + def __init__(self, ptycho_parent, pars=None): + _ProjectionEngine_cupy_stream.__init__(self, ptycho_parent, pars) + DMMixin.__init__(self, self.p.alpha) + ptycho_parent.citations.add_article(**self.article) + + +@register(name="RAAR_cupy") +class RAAR_cupy_stream(_ProjectionEngine_cupy_stream, RAARMixin): + """ + A RAAR engine in accelerated with cupy. 
+ + Defaults: + + [name] + default = RAAR_cupy + type = str + help = + doc = + + """ + + def __init__(self, ptycho_parent, pars=None): + + _ProjectionEngine_cupy_stream.__init__(self, ptycho_parent, pars) + RAARMixin.__init__(self, self.p.beta) diff --git a/ptypy/accelerate/cuda_cupy/engines/stochastic.py b/ptypy/accelerate/cuda_cupy/engines/stochastic.py new file mode 100644 index 000000000..8af49d635 --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/engines/stochastic.py @@ -0,0 +1,550 @@ +# -*- coding: utf-8 -*- +""" +Accelerated stochastic reconstruction engine. + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: see LICENSE for details. +""" + +import numpy as np +import time +import cupy as cp +import cupyx + +from ptypy import utils as u +from ptypy.utils.verbose import logger, log +from ptypy.utils import parallel +from ptypy.engines import register +from ptypy.engines.stochastic import EPIEMixin, SDRMixin +from ptypy.accelerate.base.engines.stochastic import _StochasticEngineSerial +from ptypy.accelerate.base import address_manglers +from .. 
def engine_initialize(self):
    """
    Prepare for reconstruction.

    Obtains the queue used for compute, creates the probe-centring kernels
    when ``probe_center_tol`` is set, runs the parent initialisation and
    finally creates two additional cupy streams that the engine uses for
    host-to-device (``qu_htod``) and device-to-host (``qu_dtoh``) transfers.
    """
    # queue that every kernel created by this engine is bound to
    self.queue = get_context(new_queue=True)

    # initialise kernels for centring probe if required
    if self.p.probe_center_tol is not None:
        # mass center kernel
        self.MCK = MassCenterKernel(queue=self.queue)
        # absolute sum kernel
        self.A2SK = Abs2SumKernel(dtype=self.pr.dtype, queue=self.queue)
        # interpolated shift kernel
        self.ISK = InterpolatedShiftKernel(queue=self.queue)

    # NOTE(review): the parent call presumably triggers _setup_kernels, which
    # needs self.queue - keep it after the queue has been created; confirm.
    super().engine_initialize()
    # dedicated transfer streams, separate from the compute queue
    self.qu_htod = cp.cuda.Stream()
    self.qu_dtoh = cp.cuda.Stream()
+ log(4, "Setting up FourierUpdateKernel") + kern.FUK = FourierUpdateKernel( + aux, nmodes, queue_thread=self.queue) + kern.FUK.fshape = (1,) + kern.FUK.fshape[1:] + kern.FUK.allocate() + + log(4, "Setting up PoUpdateKernel") + kern.POK = PoUpdateKernel(queue_thread=self.queue) + kern.POK.allocate() + + log(4, "Setting up AuxiliaryWaveKernel") + kern.AWK = AuxiliaryWaveKernel(queue_thread=self.queue) + kern.AWK.allocate() + + log(4, "Setting up ArrayUtilsKernel") + kern.AUK = ArrayUtilsKernel(queue=self.queue) + + #log(4, "Setting up TransposeKernel") + #kern.TK = TransposeKernel(queue=self.queue) + + log(4, "setting up MaxAbs2Kernel") + kern.MAK = MaxAbs2Kernel(queue=self.queue) + + log(4, "Setting up PropagationKernel") + kern.PROP = PropagationKernel( + aux, geo.propagator, self.queue, self.p.fft_lib) + kern.PROP.allocate() + kern.resolution = geo.resolution[0] + + if self.do_position_refinement: + log(4, "Setting up position correction") + kern.PCK = PositionCorrectionKernel( + aux, nmodes, self.p.position_refinement, geo.resolution, queue_thread=self.queue) + kern.PCK.allocate() + + ex_mem = 0 + mag_mem = 0 + for scan, kern in self.kernels.items(): + if kern.scanmodel in ("GradFull", "BlockGradFull"): + ex_mem = max(kern.aux.nbytes * 1, ex_mem) + else: + ex_mem = max(kern.aux.nbytes * fpc, ex_mem) + mag_mem = max(kern.FUK.gpu.fdev.nbytes * fpc, mag_mem) + ma_mem = mag_mem + mem = cp.cuda.runtime.memGetInfo()[0] + blk = ex_mem * EX_MA_BLOCKS_RATIO + ma_mem + mag_mem + # leave 200MB room for safety + fit = int(mem - 200 * 1024 * 1024) // blk + if not fit: + log(1, "Cannot fit memory into device, if possible reduce frames per block. 
def engine_iterate(self, num=1):
    """
    Run ``num`` stochastic iterations over all diffraction data blocks.

    For every block the exit waves, magnitudes and masks are scheduled to
    the device on the host-to-device stream; the views of the block are
    then processed one by one in shuffled order on the compute queue.

    Parameters
    ----------
    num : int, optional
        Number of iterations to perform.

    Returns
    -------
    dict
        Maps each view ID to an array holding its fourier, photon and exit
        error (in that order). The same dict is stored in ``self.error``.
    """
    self.dID_list = list(self.di.S.keys())
    error = {}
    for it in range(num):

        for iblock, dID in enumerate(self.dID_list):

            # find probe, object and exit ID in dependence of dID
            prep = self.diff_info[dID]
            pID, oID, eID = prep.poe_IDs

            # references for kernels
            kern = self.kernels[prep.label]
            FUK = kern.FUK
            AWK = kern.AWK
            POK = kern.POK
            MAK = kern.MAK
            PROP = kern.PROP

            # get aux buffer
            aux = kern.aux

            # local references
            ob = self.ob.S[oID].gpu
            pr = self.pr.S[pID].gpu

            # shuffle view order
            vieworder = prep.vieworder
            prep.rng.shuffle(vieworder)

            # Schedule ex, ma, mag to device
            ev_ex, ex_full, data_ex = self.ex_data.to_gpu(
                prep.ex, dID, self.qu_htod)
            ev_mag, mag_full, data_mag = self.mag_data.to_gpu(
                prep.mag, dID, self.qu_htod)
            ev_ma, ma_full, data_ma = self.ma_data.to_gpu(
                prep.ma, dID, self.qu_htod)

            # Reference to ex, ma and mag
            prep.ex_full = ex_full
            prep.mag_full = mag_full
            prep.ma_full = ma_full

            # synchronize h2d stream with compute stream
            self.queue.wait_event(ev_ex)

            # Iterate through views
            for i in vieworder:

                # Get local address and arrays
                addr = prep.addr_gpu[i, None]
                ex_from, ex_to = prep.addr_ex[i]
                ex = prep.ex_full[ex_from:ex_to]
                mag = prep.mag_full[i, None]
                ma = prep.ma_full[i, None]
                ma_sum = prep.ma_sum_gpu[i, None]
                obn = prep.obn
                prn = prep.prn
                err_phot = prep.err_phot_gpu[i, None]
                err_fourier = prep.err_fourier_gpu[i, None]
                err_exit = prep.err_exit_gpu[i, None]

                # position update
                self.position_update_local(prep, i)

                # build auxiliary wave
                AWK.make_aux(aux, addr, ob, pr, ex,
                             c_po=self._c, c_e=1-self._c)

                # forward FFT
                PROP.fw(aux, aux)

                # Deviation from measured data
                self.queue.wait_event(ev_mag)
                if self.p.compute_fourier_error:
                    self.queue.wait_event(ev_ma)
                    FUK.fourier_error(aux, addr, mag, ma, ma_sum)
                    FUK.error_reduce(addr, err_fourier)
                else:
                    FUK.fourier_deviation(aux, addr, mag)
                    self.queue.wait_event(ev_ma)
                FUK.fmag_update_nopbound(aux, addr, mag, ma)

                # backward FFT
                PROP.bw(aux, aux)

                # build exit wave
                AWK.make_exit(aux, addr, ob, pr, ex, c_a=self._b,
                              c_po=self._a, c_e=-(self._a + self._b))
                if self.p.compute_exit_error:
                    FUK.exit_error(aux, addr)
                    FUK.error_reduce(addr, err_exit)

                # build auxiliary wave (ob * pr product)
                AWK.build_aux2_no_ex(aux, addr, ob, pr)

                # object update
                POK.pr_norm_local(addr, pr, prn)
                POK.ob_update_local(
                    addr, ob, pr, ex, aux, prn, a=self._ob_a, b=self._ob_b)

                # probe update
                # cupy's ndarray.fill() and cupy.max() take no ``stream``
                # argument (that is the pycuda API); they run on the current
                # stream, so make the compute queue current first.
                if self._object_norm_is_global and self._pr_a == 0:
                    obn_max = cp.empty((1,), dtype=np.float32)
                    MAK.max_abs2(ob, obn_max)
                    self.queue.use()
                    obn.fill(np.float32(0.))
                else:
                    POK.ob_norm_local(addr, ob, obn)
                    self.queue.use()
                    obn_max = cp.max(obn)
                if self.p.probe_update_start <= self.curiter:
                    POK.pr_update_local(
                        addr, pr, ob, ex, aux, obn, obn_max,
                        a=self._pr_a, b=self._pr_b)

                # compute log-likelihood
                if self.p.compute_log_likelihood:
                    PROP.fw(aux, aux)
                    FUK.log_likelihood2(aux, addr, mag, ma, err_phot)

            data_ex.record_done(self.queue, 'compute')
            if iblock + len(self.ex_data) < len(self.dID_list):
                data_ex.from_gpu(self.qu_dtoh)

        # swap direction
        self.dID_list.reverse()

        # Re-center probe
        self.center_probe()

        self.curiter += 1
        self.ex_data.syncback = False

    # finish all the compute
    self.queue.synchronize()

    # cupy arrays have no get_async(); get(stream=..., out=...) performs the
    # copy on the given stream into the existing host buffer.
    for name, s in self.ob.S.items():
        s.gpu.get(stream=self.qu_dtoh, out=s.data)
    for name, s in self.pr.S.items():
        s.gpu.get(stream=self.qu_dtoh, out=s.data)

    for dID, prep in self.diff_info.items():
        err_fourier = prep.err_fourier_gpu.get()
        err_phot = prep.err_phot_gpu.get()
        err_exit = prep.err_exit_gpu.get()
        errs = np.ascontiguousarray(
            np.vstack([err_fourier, err_phot, err_exit]).T)
        error.update(zip(prep.view_IDs, errs))

    # wait for the async transfers
    self.qu_dtoh.synchronize()

    self.error = error
    return error
def position_update_local(self, prep, i):
    """
    Refine the scan position of a single view, if refinement is due.

    Nothing happens unless position refinement is enabled, the current
    iteration lies in ``[start, stop)`` of the refinement parameters and is
    a multiple of the refinement interval. Otherwise the error of the
    current address is evaluated, every trial shift offered by the address
    mangler is tested, and the kernel keeps the best address/error pair.

    Parameters
    ----------
    prep : object
        Per-block preparation record (addresses, device buffers, IDs).
    i : int
        Index of the view inside the block.
    """
    if not self.do_position_refinement:
        return

    refine = self.p.position_refinement
    in_window = (refine.stop > self.curiter >= refine.start)
    on_interval = (self.curiter % refine.interval) == 0
    if not (in_window and on_interval):
        return

    # gather the single-view slices and the kernels of this scan
    pID, oID, eID = prep.poe_IDs
    mag = prep.mag_full[i, None]
    ma = prep.ma_full[i, None]
    ma_sum = prep.ma_sum_gpu[i, None]
    ob = self.ob.S[oID].gpu
    pr = self.pr.S[pID].gpu
    kern = self.kernels[prep.label]
    aux = kern.aux
    addr = prep.addr_gpu[i, None]
    mangled_addr = prep.mangled_addr_gpu[i, None]
    err_fourier = prep.err_fourier_gpu[i, None]
    error_state = prep.error_state_gpu[i, None]
    PCK = kern.PCK
    PROP = kern.PROP
    metric = refine.metric

    def evaluate(address):
        # propagate the view at `address` and store its error in err_fourier
        PCK.build_aux(aux, address, ob, pr)
        PROP.fw(aux, aux)
        if metric == "fourier":
            PCK.fourier_error(aux, address, mag, ma, ma_sum)
            PCK.error_reduce(address, err_fourier)
        if metric == "photon":
            PCK.log_likelihood(aux, address, mag, ma, err_fourier)

    def copy_on_device(target, source):
        # asynchronous device-to-device copy (kind=3) on the compute queue
        cp.cuda.runtime.memcpyAsync(dst=target.data.ptr,
                                    src=source.data.ptr,
                                    size=err_fourier.nbytes,
                                    stream=self.queue.ptr,
                                    kind=3)

    # Keep track of object boundaries
    max_oby = ob.shape[-2] - aux.shape[-2] - 1
    max_obx = ob.shape[-1] - aux.shape[-1] - 1

    # the error of the current position is the reference to beat
    evaluate(addr)
    copy_on_device(error_state, err_fourier)

    PCK.mangler.setup_shifts(self.curiter, nframes=addr.shape[0])

    for trial in range(PCK.mangler.nshifts):
        PCK.mangler.get_address(
            trial, addr, mangled_addr, max_oby, max_obx)
        evaluate(mangled_addr)
        PCK.update_addr_and_error_state(
            addr, error_state, mangled_addr, err_fourier)

    # hand the best error found back to the caller-visible buffer
    copy_on_device(err_fourier, error_state)
@register()
class EPIE_cupy(_StochasticEngineCupy, EPIEMixin):
    """
    An accelerated implementation of the EPIE algorithm.

    Defaults:

    [name]
    default = EPIE_cupy
    type = str
    help =
    doc =

    """
    # NOTE(review): the ``Defaults:`` section above looks like it is read by
    # the engine registration machinery - keep its layout intact; confirm.

    def __init__(self, ptycho_parent, pars=None):
        """
        Initialise the cupy stochastic engine, then the EPIE mixin.

        Parameters
        ----------
        ptycho_parent : object
            Forwarded to the engine base class; its ``citations`` collection
            receives this engine's article.
        pars : object, optional
            Forwarded unchanged to the engine base class.
        """
        # The engine base goes first: the mixin arguments are read from
        # ``self.p`` (presumably populated by the base __init__).
        _StochasticEngineCupy.__init__(self, ptycho_parent, pars)
        EPIEMixin.__init__(self, self.p.alpha, self.p.beta)
        # register the reference describing this algorithm
        ptycho_parent.citations.add_article(**self.article)
def choose_fft(arr_shape, fft_type=None):
    """
    Select the FFT wrapper class for arrays with the given trailing shape.

    Parameters
    ----------
    arr_shape : sequence of int
        The last two dimensions (rows, columns) of the array to transform.
    fft_type : str, optional
        Requested backend. ``'cupy'`` always selects ``FFT_cupy``. Any other
        value (including the default ``None`` and ``'cuda'``) prefers
        ``FFT_cuda`` and falls back to ``FFT_cupy`` when the array is not
        square with a power-of-two size between 16 and 2048, or when the
        optimised version cannot be imported.

    Returns
    -------
    type
        The FFT class to instantiate.
    """
    if fft_type == 'cupy':
        # Honour the explicit request. Previously ``fft_type`` was ignored,
        # so the optimised wrapper was chosen even when 'cupy' was asked for.
        from ptypy.accelerate.cuda_cupy.cufft import FFT_cupy as FFT
        return FFT

    rows = arr_shape[0]
    columns = arr_shape[1]
    dims_are_powers_of_two = (
        rows == columns
        and rows in (16, 32, 64, 128, 256, 512, 1024, 2048))

    if dims_are_powers_of_two:
        try:
            from ptypy.accelerate.cuda_cupy.cufft import FFT_cuda as FFT
        except Exception:
            # best-effort fallback, but do not swallow KeyboardInterrupt /
            # SystemExit as the former bare ``except:`` did
            logger.info(
                'Unable to import optimised cufft version - using cufft with separte callbacks instead')
            from ptypy.accelerate.cuda_cupy.cufft import FFT_cupy as FFT
    else:
        logger.info(
            'cufft: array dimensions are not powers of two (16 to 2048) - using cufft with separated callbacks')
        from ptypy.accelerate.cuda_cupy.cufft import FFT_cupy as FFT
    return FFT
class FourierSupportKernel:
    """
    Applies a support constraint in Fourier space.

    The support array is used as the ``post_fft`` factor of a forward
    transform, which is followed by an inverse transform.
    """

    def __init__(self, support, queue_thread=None):
        """Store the support array and the queue the FFTs will run on."""
        self.queue = queue_thread
        self.support = support

    def allocate(self):
        """Create the forward (with support factor) and inverse FFT plans."""
        mask = self.support
        stream = self.queue
        fft_cls = choose_fft(mask.shape[-2:])

        # forward transform that multiplies by the support afterwards
        self._fft1 = fft_cls(mask, stream,
                             post_fft=mask,
                             symmetric=True,
                             forward=True)
        # plain inverse transform
        self._fft2 = fft_cls(mask, stream,
                             symmetric=True,
                             forward=False)

    def apply_fourier_support(self, x):
        """Transform ``x`` forward and back again, in place."""
        forward, inverse = self._fft1, self._fft2
        forward.ft(x, x)
        inverse.ift(x, x)
def allocate(self):
    """
    Create the zero-initialised float32 device buffers ``fdev`` and ``ferr``.

    Both buffers have the shape ``self.fshape`` and are stored on
    ``self.gpu``.
    """
    shape = self.fshape
    buffers = self.gpu
    buffers.fdev = cp.zeros(shape, dtype=np.float32)
    buffers.ferr = cp.zeros(shape, dtype=np.float32)
def error_reduce(self, addr, err_sum):
    """
    Launch the ``error_reduce`` kernel on ``self.gpu.ferr``.

    One block of 32x32 threads is launched per entry of ``err_sum``; the
    kernel receives the error buffer, the output array and the two trailing
    dimensions of ``self.fshape``.

    Parameters
    ----------
    addr : object
        Not used by this method; kept for interface compatibility.
    err_sum : array
        Output array; its first dimension sets the number of blocks.
    """
    stream = self.queue
    if stream is not None:
        stream.use()
    n_blocks = int(err_sum.shape[0])
    kernel_args = (self.gpu.ferr,
                   err_sum,
                   np.int32(self.fshape[1]),
                   np.int32(self.fshape[2]))
    self.error_reduce_cuda(grid=(n_blocks, 1, 1),
                           block=(32, 32, 1),
                           args=kernel_args)
def log_likelihood(self, b_aux, addr, mag, mask, err_phot):
    """
    Launch the ``log_likelihood`` kernel and reduce its result.

    The kernel is started with one 32x32 block per entry along the first
    dimension of ``mag`` and writes into ``self.gpu.ferr``; afterwards
    ``self.error_reduce`` folds that buffer into ``err_phot``.

    Parameters
    ----------
    b_aux, addr, mag, mask : arrays
        Passed through to the kernel unchanged.
    err_phot : array
        Receives the reduced error.
    """
    stream = self.queue
    error_buffer = self.gpu.ferr
    if stream is not None:
        stream.use()
    n_blocks = int(mag.shape[0])
    kernel_args = (np.int32(self.nmodes),
                   b_aux,
                   mask,
                   mag,
                   addr,
                   error_buffer,
                   np.int32(self.fshape[1]),
                   np.int32(self.fshape[2]))
    self.log_likelihood_cuda(grid=(n_blocks, 1, 1),
                             block=(32, 32, 1),
                             args=kernel_args)
    # TODO: we might want to move this call outside of here
    self.error_reduce(addr, err_phot)
def exit_error(self, aux, addr):
    """
    Launch the ``exit_error`` kernel, writing into ``self.gpu.ferr``.

    One 32x32 block is launched per entry along the first dimension of
    ``addr``.

    Parameters
    ----------
    aux : array
        Passed through to the kernel.
    addr : array
        Address array; its first dimension sets the number of blocks.
    """
    n_blocks = int(addr.shape[0])
    error_buffer = self.gpu.ferr
    stream = self.queue
    if stream is not None:
        stream.use()
    kernel_args = (np.int32(self.nmodes),
                   aux,
                   error_buffer,
                   addr,
                   np.int32(self.fshape[1]),
                   np.int32(self.fshape[2]))
    self.exit_error_cuda(grid=(n_blocks, 1, 1),
                         block=(32, 32, 1),
                         args=kernel_args)
# DEPRECATED?
def load(self, aux, ob, pr, ex, addr):
    """
    Load the arrays through the parent class and mirror them on the device.

    After the parent ``load`` has run, every entry found in ``self.npy`` is
    copied to the GPU and stored under the same key in ``self.ocl``.

    Parameters
    ----------
    aux, ob, pr, ex, addr : arrays
        Forwarded unchanged to the parent implementation.
    """
    super(AuxiliaryWaveKernel, self).load(aux, ob, pr, ex, addr)
    for key, array in self.npy.__dict__.items():
        # cupy has no ``to_gpu`` (that is pycuda's gpuarray API);
        # ``cp.asarray`` performs the host-to-device copy.
        self.ocl.__dict__[key] = cp.asarray(array)
def build_aux2(self, b_aux, addr, ob, pr, ex, alpha=1.0):
    """
    Legacy Difference Map entry point that forwards to ``make_aux2``.

    The probe-object coefficient passed on is ``1 + alpha`` and the
    exit-wave coefficient is ``-alpha``.
    """
    # DM only, legacy. also make_aux2 does no exit in the parent
    c_po = 1. + alpha
    c_e = -alpha
    self.make_aux2(b_aux, addr, ob, pr, ex, c_po, c_e)
def __init__(self, aux, nmodes=1, queue=None, accumulate_type='double', math_type='float'):
    """
    Set up the gradient descent kernels.

    Parameters
    ----------
    aux : array
        Forwarded to the parent class.
    nmodes : int, optional
        Forwarded to the parent class.
    queue : object, optional
        Queue the kernels will be launched on.
    accumulate_type : {'double', 'float'}, optional
        Type substituted as ``ACC_TYPE`` when loading the kernels.
    math_type : {'double', 'float'}, optional
        Type substituted as ``MATH_TYPE`` when loading the kernels.

    Raises
    ------
    ValueError
        If ``accumulate_type`` or ``math_type`` is neither ``'double'`` nor
        ``'float'``.
    """
    super().__init__(aux, nmodes)

    # validate before touching any state; the old message wrongly read
    # "double for float"
    allowed = ('double', 'float')
    if accumulate_type not in allowed or math_type not in allowed:
        raise ValueError(
            "accumulate and math types must be double or float")

    self.queue = queue
    self.accumulate_type = accumulate_type
    self.math_type = math_type

    # device buffers, created later in allocate()
    self.gpu = Adict()
    self.gpu.LLden = None
    self.gpu.LLerr = None
    self.gpu.Imodel = None

    # input/output precision follows the real type of the parent class
    real_type = 'float' if self.ftype == np.float32 else 'double'
    subs = {
        'IN_TYPE': real_type,
        'OUT_TYPE': real_type,
        'ACC_TYPE': self.accumulate_type,
        'MATH_TYPE': self.math_type
    }
    self.make_model_cuda = load_kernel('make_model', subs)
    self.make_a012_cuda = load_kernel('make_a012', subs)
    self.error_reduce_cuda = load_kernel('error_reduce', {
        **subs,
        'BDIM_X': 32,
        'BDIM_Y': 32
    })
    self.fill_b_cuda, self.fill_b_reduce_cuda = load_kernel(
        ('fill_b', 'fill_b_reduce'),
        {
            **subs,
            'BDIM_X': 1024
        },
        file="fill_b.cu")
    self.main_cuda = load_kernel('gd_main', subs)
    self.floating_intensity_cuda_step1, self.floating_intensity_cuda_step2 = \
        load_kernel(('step1', 'step2'), subs, 'intens_renorm.cu')
self.gpu.Btmp = cp.zeros( + sh, dtype=np.float64 if self.accumulate_type == 'double' else np.float32) + + def make_model(self, b_aux, addr): + # reference shape + sh = self.fshape + + # batch buffers + Imodel = self.gpu.Imodel + aux = b_aux + + # dimensions / grid + z = np.int32(sh[0]) + y = np.int32(self.nmodes) + x = np.int32(sh[1] * sh[2]) + bx = 1024 + if self.queue is not None: + self.queue.use() + self.make_model_cuda(grid=(int((x + bx - 1) // bx), 1, int(z)), + block=(bx, 1, 1), + args=(aux, Imodel, z, y, x)) + + def make_a012(self, b_f, b_a, b_b, addr, I, fic): + # reference shape (= GPU global dims) + sh = I.shape + + # stopper + maxz = I.shape[0] + + A0 = self.gpu.Imodel + A1 = self.gpu.LLerr + A2 = self.gpu.LLden + + z = np.int32(sh[0]) + maxz = np.int32(maxz) + y = np.int32(self.nmodes) + x = np.int32(sh[1]*sh[2]) + bx = 1024 + if self.queue is not None: + self.queue.use() + self.make_a012_cuda(grid=(int((x + bx - 1) // bx), 1, int(z)), + block=(bx, 1, 1), + args=(b_f, b_a, b_b, I, fic, + A0, A1, A2, z, y, x, maxz)) + + def fill_b(self, addr, Brenorm, w, B): + # stopper + maxz = w.shape[0] + + A0 = self.gpu.Imodel + A1 = self.gpu.LLerr + A2 = self.gpu.LLden + + sz = np.int32(np.prod(w.shape)) + blks = int((sz + 1023) // 1024) + # print('blocks={}, Btmp={}, fshape={}, wshape={}, modes={}'.format(blks, self.gpu.Btmp.shape, self.fshape, w.shape, self.nmodes)) + assert self.gpu.Btmp.shape[1] >= blks + # 2-stage reduction - even if 1 block, as we have a += in second kernel + if self.queue is not None: + self.queue.use() + self.fill_b_cuda(grid=(blks, 1, 1), + block=(1024, 1, 1), + args=(A0, A1, A2, w, + np.float32(Brenorm) if self.ftype == np.float32 else np.float64( + Brenorm), + sz, self.gpu.Btmp)) + self.fill_b_reduce_cuda(grid=(1, 1, 1), + block=(1024, 1, 1), + args=(self.gpu.Btmp, B, np.int32(blks))) + + def error_reduce(self, addr, err_sum): + # reference shape (= GPU global dims) + sh = err_sum.shape + + # stopper + maxz = err_sum.shape[0] + + # batch 
buffers + ferr = self.gpu.LLerr + + # print('maxz={}, ferr={}'.format(maxz, ferr.shape)) + assert (maxz <= np.prod(ferr.shape[:-2])) + + if self.queue is not None: + self.queue.use() + + # Reduces the LL error along the last 2 dimensions.fd + self.error_reduce_cuda(grid=(int(maxz), 1, 1), + block=(32, 32, 1), + args=(ferr, err_sum, + np.int32(ferr.shape[-2]), + np.int32(ferr.shape[-1]))) + + def floating_intensity(self, addr, w, I, fic): + + # reference shape (= GPU global dims) + sh = I.shape + + # stopper + maxz = I.shape[0] + + # internal buffers + num = self.gpu.LLerr + den = self.gpu.LLden + Imodel = self.gpu.Imodel + fic_tmp = self.gpu.fic_tmp + + ## math ## + xall = np.int32(maxz * sh[1] * sh[2]) + bx = 1024 + + if self.queue is not None: + self.queue.use() + + self.floating_intensity_cuda_step1(grid=(int((xall + bx - 1) // bx), 1, 1), + block=(bx, 1, 1), + args=(Imodel, I, w, num, den, + xall)) + + self.error_reduce_cuda(grid=(int(maxz), 1, 1), + block=(32, 32, 1), + args=(num, fic, + np.int32(num.shape[-2]), + np.int32(num.shape[-1]))) + + self.error_reduce_cuda(grid=(int(maxz), 1, 1), + block=(32, 32, 1), + args=(den, fic_tmp, + np.int32(den.shape[-2]), + np.int32(den.shape[-1]))) + + self.floating_intensity_cuda_step2(grid=(1, 1, int(maxz)), + block=(32, 32, 1), + args=(fic_tmp, fic, Imodel, + np.int32(Imodel.shape[-2]), + np.int32(Imodel.shape[-1]))) + + def main(self, b_aux, addr, w, I): + nmodes = self.nmodes + # stopper + maxz = I.shape[0] + + # batch buffers + err = self.gpu.LLerr + Imodel = self.gpu.Imodel + aux = b_aux + + # write-to shape (= GPU global dims) + ish = aux.shape + + x = np.int32(ish[1] * ish[2]) + y = np.int32(nmodes) + z = np.int32(maxz) + bx = 1024 + + if self.queue is not None: + self.queue.use() + + # print(Imodel.dtype, I.dtype, w.dtype, err.dtype, aux.dtype, z, y, x) + self.main_cuda(grid=(int((x + bx - 1) // bx), 1, int(z)), + block=(bx, 1, 1), + args=(Imodel, I, w, err, aux, + z, y, x)) + + +class 
PoUpdateKernel(ab.PoUpdateKernel): + + def __init__(self, queue_thread=None, + math_type='float', accumulator_type='float'): + super(PoUpdateKernel, self).__init__() + # and now initialise the cuda + if math_type not in ['double', 'float']: + raise ValueError( + 'only float and double are supported for math_type') + if accumulator_type not in ['double', 'float']: + raise ValueError( + 'only float and double are supported for accumulator_type') + self.math_type = math_type + self.accumulator_type = accumulator_type + self.queue = queue_thread + self.norm = None + self.MAK = MaxAbs2Kernel(self.queue) + self.ob_update_cuda = load_kernel("ob_update", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.ob_update2_cuda = None # load_kernel("ob_update2") + self.pr_update_cuda = load_kernel("pr_update", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.pr_update2_cuda = None + self.ob_update_ML_cuda = load_kernel("ob_update_ML", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.ob_update2_ML_cuda = None + self.pr_update_ML_cuda = load_kernel("pr_update_ML", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.pr_update2_ML_cuda = None + self.ob_update_local_cuda = load_kernel("ob_update_local", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + self.pr_update_local_cuda = load_kernel("pr_update_local", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + self.ob_norm_local_cuda = load_kernel("ob_norm_local", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + self.pr_norm_local_cuda = load_kernel("pr_norm_local", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': 
self.accumulator_type + }) + + def ob_update(self, addr, ob, obn, pr, ex, atomics=True): + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + if obn.dtype != np.float32: + raise ValueError( + "Denominator must be float32 in current implementation") + + if self.queue is not None: + self.queue.use() + if atomics: + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError( + 'Address not in required shape for atomics ob_update') + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + self.ob_update_cuda(grid=(int(num_pods), 1, 1), + block=(32, 32, 1), + args=(ex, num_pods, prsh[1], prsh[2], + pr, prsh[0], prsh[1], prsh[2], + ob, obsh[0], obsh[1], obsh[2], + addr, + obn)) + else: + if addr.shape[0] != 5 or addr.shape[1] != 3: + raise ValueError( + 'Address not in required shape for tiled ob_update') + num_pods = np.int32(addr.shape[2] * addr.shape[3]) + if not self.ob_update2_cuda: + self.ob_update2_cuda = load_kernel("ob_update2", { + "NUM_MODES": obsh[0], + "BDIM_X": 16, + "BDIM_Y": 16, + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + + grid = [int((x+15)//16) for x in ob.shape[-2:]] + grid = (grid[1], grid[0], int(1)) + self.ob_update2_cuda(grid=grid, + block=(16, 16, 1), + args=(prsh[-1], obsh[0], num_pods, obsh[-2], obsh[-1], + prsh[0], + np.int32(ex.shape[0]), + np.int32(ex.shape[1]), + np.int32(ex.shape[2]), + ob, obn, pr, ex, addr)) + + def pr_update(self, addr, pr, prn, ob, ex, atomics=True): + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + if prn.dtype != np.float32: + raise ValueError( + "Denominator must be float32 in current implementation") + if self.queue is not None: + self.queue.use() + if atomics: + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError( + 'Address not in required shape for atomics pr_update') + + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + 
self.pr_update_cuda(grid=(int(num_pods), 1, 1), + block=(32, 32, 1), + args=(ex, num_pods, prsh[1], prsh[2], + pr, prsh[0], prsh[1], prsh[2], + ob, obsh[0], obsh[1], obsh[2], + addr, + prn)) + else: + if addr.shape[0] != 5 or addr.shape[1] != 3: + raise ValueError( + 'Address not in required shape for tiled pr_update') + + num_pods = np.int32(addr.shape[2] * addr.shape[3]) + if not self.pr_update2_cuda: + self.pr_update2_cuda = load_kernel("pr_update2", { + "NUM_MODES": prsh[0], + "BDIM_X": 16, + "BDIM_Y": 16, + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + + grid = [int((x+15)//16) for x in pr.shape[-2:]] + grid = (grid[0], grid[1], int(1)) + self.pr_update2_cuda(grid=grid, + block=(16, 16, 1), + args=(prsh[-1], obsh[-2], obsh[-1], + prsh[0], obsh[0], num_pods, + pr, prn, ob, ex, addr)) + + def ob_update_ML(self, addr, ob, pr, ex, fac=2.0, atomics=True): + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] + + if self.queue is not None: + self.queue.use() + if atomics: + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError( + 'Address not in required shape for tiled ob_update') + + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + self.ob_update_ML_cuda(grid=(int(num_pods), 1, 1), + block=(32, 32, 1), + args=(ex, num_pods, exsh[1], exsh[2], + pr, prsh[0], prsh[1], prsh[2], + ob, obsh[0], obsh[1], obsh[2], + addr, + np.float32( + fac) if ex.dtype == np.complex64 else np.float64(fac))) + else: + if addr.shape[0] != 5 or addr.shape[1] != 3: + raise ValueError( + 'Address not in required shape for tiled ob_update') + + num_pods = np.int32(addr.shape[2] * addr.shape[3]) + if not self.ob_update2_ML_cuda: + self.ob_update2_ML_cuda = load_kernel("ob_update2_ML", { + "NUM_MODES": obsh[0], + "BDIM_X": 16, + "BDIM_Y": 16, + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': 
self.accumulator_type + }) + grid = [int((x+15)//16) for x in ob.shape[-2:]] + grid = (grid[1], grid[0], int(1)) + self.ob_update2_ML_cuda(grid=grid, + block=(16, 16, 1), + args=(prsh[-1], obsh[0], num_pods, obsh[-2], obsh[-1], + prsh[0], + np.int32(ex.shape[0]), + np.int32(ex.shape[1]), + np.int32(ex.shape[2]), + ob, pr, ex, addr, + np.float32( + fac) if ex.dtype == np.complex64 else np.float64(fac))) + + def pr_update_ML(self, addr, pr, ob, ex, fac=2.0, atomics=False): + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + if self.queue is not None: + self.queue.use() + if atomics: + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError( + 'Address not in required shape for tiled pr_update') + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + self.pr_update_ML_cuda(grid=(int(num_pods), 1, 1), + block=(32, 32, 1), + args=(ex, num_pods, prsh[1], prsh[2], + pr, prsh[0], prsh[1], prsh[2], + ob, obsh[0], obsh[1], obsh[2], + addr, + np.float32( + fac) if ex.dtype == np.complex64 else np.float64(fac))) + else: + if addr.shape[0] != 5 or addr.shape[1] != 3: + raise ValueError( + 'Address not in required shape for tiled pr_update') + num_pods = np.int32(addr.shape[2] * addr.shape[3]) + if not self.pr_update2_ML_cuda: + self.pr_update2_ML_cuda = load_kernel("pr_update2_ML", { + "NUM_MODES": prsh[0], + "BDIM_X": 16, + "BDIM_Y": 16, + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type, + 'ACC_TYPE': self.accumulator_type + }) + + grid = [int((x+15)//16) for x in pr.shape[-2:]] + grid = (grid[0], grid[1], int(1)) + self.pr_update2_ML_cuda(grid=grid, + block=(16, 16, 1), + args=(prsh[-1], obsh[-2], obsh[-1], + prsh[0], obsh[0], num_pods, + pr, ob, ex, addr, + np.float32( + fac) if ex.dtype == np.complex64 else np.float64(fac))) + + def ob_update_local(self, addr, ob, pr, ex, aux, prn, a=0., b=1.): + if self.queue is not None: + self.queue.use() + prn_max = cp.max(prn) + obsh = [np.int32(ax) for ax in ob.shape] 
+ prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] + # atomics version only + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError( + 'Address not in required shape for tiled ob_update') + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + bx = 64 + by = 1 + self.ob_update_local_cuda(grid=( + 1, int((exsh[1] + by - 1)//by), int(num_pods)), + block=(bx, by, 1), + args=(ex, aux, + exsh[0], exsh[1], exsh[2], + pr, + prsh[0], prsh[1], prsh[2], + prn, + ob, + obsh[0], obsh[1], obsh[2], + addr, + prn_max, + np.float32(a), + np.float32(b))) + + def pr_update_local(self, addr, pr, ob, ex, aux, obn, obn_max, a=0., b=1.): + obsh = [np.int32(ax) for ax in ob.shape] + prsh = [np.int32(ax) for ax in pr.shape] + exsh = [np.int32(ax) for ax in ex.shape] + # atomics version only + if addr.shape[3] != 3 or addr.shape[2] != 5: + raise ValueError( + 'Address not in required shape for tiled pr_update') + if self.queue is not None: + self.queue.use() + num_pods = np.int32(addr.shape[0] * addr.shape[1]) + bx = 64 + by = 1 + self.pr_update_local_cuda(grid=( + 1, int((exsh[1] + by - 1) // by), int(num_pods)), + block=(bx, by, 1), + args=(ex, aux, + exsh[0], exsh[1], exsh[2], + pr, + prsh[0], prsh[1], prsh[2], + obn, + ob, + obsh[0], obsh[1], obsh[2], + addr, + obn_max, + np.float32(a), + np.float32(b))) + + def ob_norm_local(self, addr, ob, obn): + obsh = [np.int32(ax) for ax in ob.shape] + obnsh = [np.int32(ax) for ax in obn.shape] + bx = 64 + by = 1 + if self.queue is not None: + self.queue.use() + self.ob_norm_local_cuda(grid=( + 1, int((obnsh[1] + by - 1)//by), int(obnsh[0])), + block=(bx, by, 1), + args=(obn, + obnsh[0], obnsh[1], obnsh[2], + ob, + obsh[0], obsh[1], obsh[2], + addr)) + + def pr_norm_local(self, addr, pr, prn): + prsh = [np.int32(ax) for ax in pr.shape] + prnsh = [np.int32(ax) for ax in prn.shape] + bx = 64 + by = 1 + if self.queue is not None: + self.queue.use() + self.pr_norm_local_cuda(grid=( + 1, int((prnsh[1] + by - 
1)//by), int(prnsh[0])), + block=(bx, by, 1), + args=(prn, + prnsh[0], prnsh[1], prnsh[2], + pr, + prsh[0], prsh[1], prsh[2], + addr)) + + +class PositionCorrectionKernel(ab.PositionCorrectionKernel): + from ptypy.accelerate.cuda_cupy import address_manglers + + # these are used by the self.setup method - replacing them with the GPU implementation + MANGLERS = { + 'Annealing': address_manglers.RandomIntMangler, + 'GridSearch': address_manglers.GridSearchMangler + } + + def __init__(self, *args, queue_thread=None, math_type='float', accumulate_type='float', **kwargs): + super(PositionCorrectionKernel, self).__init__(*args, **kwargs) + # make sure we set the right stream in the mangler + self.mangler.queue = queue_thread + if math_type not in ['float', 'double']: + raise ValueError('Only float or double math is supported') + if accumulate_type not in ['float', 'double']: + raise ValueError('Only float or double math is supported') + + # add kernels + self.math_type = math_type + self.accumulate_type = accumulate_type + self.queue = queue_thread + self._ob_shape = None + self._ob_id = None + self.fourier_error_cuda = load_kernel("fourier_error", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.error_reduce_cuda = load_kernel("error_reduce", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'BDIM_X': 32, + 'BDIM_Y': 32, + 'ACC_TYPE': self.accumulate_type + }) + self.log_likelihood_cuda, self.log_likelihood_ml_cuda = load_kernel( + ("log_likelihood", "log_likelihood_ml"), { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }, "log_likelihood.cu") + self.build_aux_pc_cuda = load_kernel("build_aux_position_correction", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float', + 'MATH_TYPE': self.math_type + }) + self.update_addr_and_error_state_cuda = load_kernel("update_addr_error_state", { + 'IN_TYPE': 'float', + 'OUT_TYPE': 'float' + }) + + self.gpu = Adict() + self.gpu.fdev = None + self.gpu.ferr = None + + def 
allocate(self): + self.gpu.fdev = cp.zeros(self.fshape, dtype=np.float32) + self.gpu.ferr = cp.zeros(self.fshape, dtype=np.float32) + + def build_aux(self, b_aux, addr, ob, pr): + obr, obc = self._cache_object_shape(ob) + sh = addr.shape + nmodes = sh[1] + maxz = sh[0] + if self.queue is not None: + self.queue.use() + self.build_aux_pc_cuda(grid=(int(maxz * nmodes), 1, 1), + block=(32, 32, 1), + args=(b_aux, + pr, + np.int32(pr.shape[1]), np.int32( + pr.shape[2]), + ob, + obr, obc, + addr)) + + def fourier_error(self, f, addr, fmag, fmask, mask_sum): + fdev = self.gpu.fdev + ferr = self.gpu.ferr + if self.queue is not None: + self.queue.use() + self.fourier_error_cuda(grid=(int(fmag.shape[0]), 1, 1), + block=(32, 32, 1), + args=(np.int32(self.nmodes), + f, + fmask, + fmag, + fdev, + ferr, + mask_sum, + addr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]))) + + def error_reduce(self, addr, err_fmag): + # import sys + # float_size = sys.getsizeof(np.float32(4)) + # shared_memory_size =int(2 * 32 * 32 *float_size) # this doesn't work even though its the same... 
+ # shared_memory_size = int(49152) + if self.queue is not None: + self.queue.use() + self.error_reduce_cuda(grid=(int(err_fmag.shape[0]), 1, 1), + block=(32, 32, 1), + args=(self.gpu.ferr, + err_fmag, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]))) + + def log_likelihood(self, b_aux, addr, mag, mask, err_phot): + ferr = self.gpu.ferr + if self.queue is not None: + self.queue.use() + self.log_likelihood_cuda(grid=(int(mag.shape[0]), 1, 1), + block=(32, 32, 1), + args=(np.int32(self.nmodes), + b_aux, + mask, + mag, + addr, + ferr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]))) + # TODO: we might want to move this call outside of here + self.error_reduce(addr, err_phot) + + def log_likelihood_ml(self, b_aux, addr, I, weights, err_phot): + ferr = self.gpu.ferr + if self.queue is not None: + self.queue.use() + self.log_likelihood_ml_cuda(grid=(int(I.shape[0]), 1, 1), + block=(32, 32, 1), + args=(np.int32(self.nmodes), + b_aux, + weights, + I, + addr, + ferr, + np.int32(self.fshape[1]), + np.int32(self.fshape[2]))) + # TODO: we might want to move this call outside of here + self.error_reduce(addr, err_phot) + + def update_addr_and_error_state(self, addr, error_state, mangled_addr, err_sum): + # assume all data is on GPU! 
+ if self.queue is not None: + self.queue.use() + self.update_addr_and_error_state_cuda(grid=( + 1, int((err_sum.shape[0] + 1) // 2), 1), + block=(32, 2, 1), + args=(addr, mangled_addr, error_state, err_sum, + np.int32(addr.shape[1]))) + + def _cache_object_shape(self, ob): + oid = id(ob) + + if not oid == self._ob_id: + self._ob_id = oid + self._ob_shape = (np.int32(ob.shape[-2]), np.int32(ob.shape[-1])) + + return self._ob_shape diff --git a/ptypy/accelerate/cuda_cupy/mem_utils.py b/ptypy/accelerate/cuda_cupy/mem_utils.py new file mode 100644 index 000000000..a92a9657b --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/mem_utils.py @@ -0,0 +1,319 @@ +import numpy as np +import cupy as cp +import cupyx +from collections import deque + + +def make_pagelocked_paired_arrays(ar): + mem = cupyx.empty_pinned(ar.shape, ar.dtype, order="C") + mem[:] = ar + return cp.asarray(mem), mem + + +class GpuData: + """ + Manages one block of GPU data with corresponding CPU data. + Keeps track of which cpu array is currently on GPU by its id, + and transfers if it's not already there. + + To be used for the exit wave, ma, and mag arrays. + Note: Allocator should be pooled for best performance + """ + + def __init__(self, nbytes, syncback=False): + """ + New instance of GpuData. Allocates the GPU-side array. + + :param nbytes: Number of bytes held by this instance. 
+ :param syncback: Should the data be synced back to CPU any time it's swapped out + """ + + self.gpu = None + self.gpuraw = cp.cuda.alloc(nbytes) + self.nbytes = nbytes + self.nbytes_buffer = nbytes + self.gpuId = None + self.cpu = None + self.syncback = syncback + self.ev_done = None + + def _allocator(self, nbytes): + if nbytes > self.nbytes: + raise Exception('requested more bytes than maximum given before: {} vs {}'.format( + nbytes, self.nbytes)) + return self.gpuraw + + def record_done(self, stream): + self.ev_done = cp.cuda.Event() + with stream: + self.ev_done.record() + + def to_gpu(self, cpu, id, stream): + """ + Transfer cpu array to GPU on stream (async), keeping track of its id + """ + if self.gpuId != id: + if self.syncback: + self.from_gpu(stream) + self.gpuId = id + self.cpu = cpu + if self.ev_done is not None: + self.ev_done.synchronize() + alloc = cp.cuda.get_allocator() + try: + cp.cuda.set_allocator(self._allocator) + with stream: + self.gpu = cp.asarray(cpu) + finally: + cp.cuda.set_allocator(alloc) + return self.gpu + + def from_gpu(self, stream): + """ + Transfer data back to CPU, into same data handle it was copied from + before. + """ + if self.cpu is not None and self.gpuId is not None and self.gpu is not None: + if self.ev_done is not None: + stream.wait_event(self.ev_done) + cp.cuda.runtime.memcpyAsync(dst=self.cpu.ctypes.data, + src=self.gpu.data.ptr, + size=self.gpu.nbytes, + kind=2, # d2h + stream=stream.ptr) + self.ev_done = cp.cuda.Event() + self.ev_done.record(stream) + + def resize(self, nbytes): + """ + Resize the size of the underlying buffer, to allow re-use in different contexts. 
+ Note that memory will only be freed/reallocated if the new number of bytes are + either larger than before, or if they are less than 90% of the original size - + otherwise it reuses the existing buffer + """ + if nbytes > self.nbytes_buffer or nbytes < self.nbytes_buffer * .9: + self.nbytes_buffer = nbytes + self.gpuraw.mem.free() + self.gpuraw = cp.cuda.alloc(nbytes) + + self.nbytes = nbytes + self.reset() + + def reset(self): + """ + Resets handles of cpu references and ids, so that all data will be transfered + again even if IDs match. + """ + self.gpuId = None + self.cpu = None + self.ev_done = None + + def free(self): + """ + Free the underlying buffer on GPU - this object should not be used afterwards + """ + self.gpuraw.mem.free() + self.gpuraw = None + + +class GpuData2(GpuData): + """ + Manages one block of GPU data with corresponding CPU data. + Keeps track of which cpu array is currently on GPU by its id, + and transfers if it's not already there. + + To be used for the exit wave, ma, and mag arrays. + Note: Allocator should be pooled for best performance + """ + + def __init__(self, nbytes, syncback=False): + """ + New instance of GpuData. Allocates the GPU-side array. + + :param nbytes: Number of bytes held by this instance. + :param syncback: Should the data be synced back to CPU any time it's swapped out + """ + self.done_what = None + super().__init__(nbytes, syncback) + + def record_done(self, stream, what): + assert what in ['dtoh', 'htod', 'compute'] + self.ev_done = cp.cuda.Event() + with stream: + self.ev_done.record() + self.done_what = what + + def to_gpu(self, cpu, ident, stream): + """ + Transfer cpu array to GPU on stream (async), keeping track of its id + """ + ident = id(cpu) if ident is None else ident + if self.gpuId != ident: + if self.ev_done is not None: + stream.wait_event(self.ev_done) + # Safety measure. 
This is asynchronous, but it should still work + # Essentially we want to copy the data held in gpu array back to its CPU + # handle before the buffer can be reused. + if self.done_what != 'dtoh' and self.syncback: + # uploads on the download stream, easy to spot in nsight-sys + self.from_gpu(stream) + self.gpuId = ident + self.cpu = cpu + alloc = cp.cuda.get_allocator() + try: + cp.cuda.set_allocator(self._allocator) + with stream: + self.gpu = cp.asarray(cpu) + finally: + cp.cuda.set_allocator(alloc) + self.record_done(stream, 'htod') + return self.ev_done, self.gpu + + def from_gpu(self, stream): + """ + Transfer data back to CPU, into same data handle it was copied from + before. + """ + if self.cpu is not None and self.gpuId is not None and self.gpu is not None: + # Wait for any action recorded with this array + if self.ev_done is not None: + stream.wait_event(self.ev_done) + cp.cuda.runtime.memcpyAsync(dst=self.cpu.ctypes.data, + src=self.gpu.data.ptr, + size=self.gpu.nbytes, + kind=2, # d2h + stream=stream.ptr) + self.record_done(stream, 'dtoh') + # Mark for reuse + self.gpuId = None + return self.ev_done + else: + return None + + +class GpuDataManager: + """ + Manages a set of GpuData instances, to keep several blocks on device. + + Currently all blocks must be the same size. + + Note that the syncback property is used so that during fourier updates, + the exit wave array is synced bck to cpu (it is updated), + while during probe update, it's not. + """ + + def __init__(self, nbytes, num, max=None, syncback=False): + """ + Create an instance of GpuDataManager. + Parameters are the same as for GpuData, and num is the number of + GpuData instances to create (blocks on device). + """ + self._syncback = syncback + self._nbytes = nbytes + self.data = [] + self.max = max + for i in range(num): + self.add_data_block() + + def add_data_block(self, nbytes=None): + """ + Add a GpuData block. 
+ + Parameters + ---------- + nbytes - Size of block + + Returns + ------- + """ + if self.max is None or len(self) < self.max: + nbytes = nbytes if nbytes is not None else self._nbytes + self.data.append(GpuData2(nbytes, self._syncback)) + + @property + def syncback(self): + """ + Get if syncback of data to CPU on swapout is enabled. + """ + return self._syncback + + @syncback.setter + def syncback(self, whether): + """ + Adjust the syncback setting + """ + self._syncback = whether + for d in self.data: + d.syncback = whether + + @property + def nbytes(self): + """ + Get the number of bytes in each block + """ + return self.data[0].nbytes + + @property + def memory(self): + """ + Get all memory occupied by all blocks + """ + m = 0 + for d in self.data: + m += d.nbytes_buffer + return m + + def __len__(self): + return len(self.data) + + def reset(self, nbytes, num): + """ + Reset this object as if these parameters were given to the constructor. + The syncback property is untouched. + """ + sync = self.syncback + # remove if too many, explictly freeing memory + for i in range(num, len(self.data)): + self.data[i].free() + # cut short if too many + self.data = self.data[:num] + # reset existing + for d in self.data: + d.resize(nbytes) + # append new ones + for i in range(len(self.data), num): + self.data.append(GpuData2(nbytes, sync)) + + def free(self): + """ + Explicitly clear all data blocks - same as resetting to 0 blocks + """ + self.reset(0, 0) + + def to_gpu(self, cpu, id, stream, pop_id="none"): + """ + Transfer a block to the GPU, given its ID and CPU data array + """ + idx = 0 + for x in self.data: + if x.gpuId == id or x.gpuId == pop_id: + break + idx += 1 + if idx == len(self.data): + idx = 0 + else: + pass + m = self.data.pop(idx) + self.data.append(m) + #print("Swap %s for %s and move from %d to %d" % (m.gpuId,id,idx,len(self.data))) + ev, gpu = m.to_gpu(cpu, id, stream) + # return the wait event, the gpu array and the function to register a finished 
computation + return ev, gpu, m + + def sync_to_cpu(self, stream): + """ + Sync back all data to CPU + """ + for x in self.data: + x.from_gpu(stream) + diff --git a/ptypy/accelerate/cuda_cupy/multi_gpu.py b/ptypy/accelerate/cuda_cupy/multi_gpu.py new file mode 100644 index 000000000..79f511423 --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/multi_gpu.py @@ -0,0 +1,151 @@ +""" +Multi-GPU AllReduce Wrapper, that uses NCCL via cupy if it's available, +and otherwise falls back to CUDA-aware MPI, +and if that doesn't work, uses host/device copies with regular MPI. + +Findings: + +1) OpenMPI with CUDA support needs to be available, and: + - mpi4py needs to be compiled from master (3.1.0a - latest stable release 3.0.x doesn't have it) + - OpenMPI in a conda install needs to have the environment variable + --> if cuda support isn't enabled, the application simply crashes with a seg fault + +2) For NCCL peer-to-peer transfers, the EXCLUSIVE compute mode cannot be used. + It should be in DEFAULT mode. 
+ +""" + +from pkg_resources import parse_version +import numpy as np +import cupy as cp +from ptypy.utils import parallel +from ptypy.utils.verbose import logger, log +import os +from cupy.cuda import nccl + +try: + import mpi4py +except ImportError: + mpi4py = None + +# properties to check which versions are available + +# use NCCL if it is available, and the user didn't override the +# default selection with environment variables +have_nccl = (not 'PTYPY_USE_CUDAMPI' in os.environ) and \ + (not 'PTYPY_USE_MPI' in os.environ) + +# At the moment, we require: +# the OpenMPI env var OMPI_MCA_opal_cuda_support to be set to true, +# mpi4py >= 3.1.0 +# and not setting the PTYPY_USE_MPI environment variable +# +# -> we ideally want to allow enabling support from a parameter in ptypy +have_cuda_mpi = (mpi4py is not None) and \ + "OMPI_MCA_opal_cuda_support" in os.environ and \ + os.environ["OMPI_MCA_opal_cuda_support"] == "true" and \ + parse_version(parse_version(mpi4py.__version__).base_version) >= parse_version("3.1.0") and \ + not ('PTYPY_USE_MPI' in os.environ) + + +class MultiGpuCommunicatorBase: + """Base class for multi-GPU communicator options, to aggregate common bits""" + + def __init__(self): + self.rank = parallel.rank + self.ndev = parallel.size + + def allReduceSum(self, arr): + """Call MPI.all_reduce in-place, with array on GPU""" + # base class only checks properties of arrays + assert isinstance(arr, cp.ndarray), "Input must be a GPU Array" + + +class MultiGpuCommunicatorMpi(MultiGpuCommunicatorBase): + """Communicator for AllReduce that uses MPI on the CPU, i.e. 
D2H, allreduce, H2D""" + + def allReduceSum(self, arr): + """Call MPI.all_reduce in-place, with array on GPU""" + super().allReduceSum(arr) + + if parallel.MPIenabled: + # note: this creates a temporary CPU array + data = arr.get() + parallel.allreduce(data) + arr.set(data) + + +class MultiGpuCommunicatorCudaMpi(MultiGpuCommunicatorBase): + + def allReduceSum(self, arr): + """Call MPI.all_reduce in-place, with array on GPU""" + + if parallel.MPIenabled: + comm = parallel.comm + comm.Allreduce(parallel.MPI.IN_PLACE, arr) + + +class MultiGpuCommunicatorNccl(MultiGpuCommunicatorBase): + + def __init__(self): + super().__init__() + + # Check if GPUs are in default mode + if cp.cuda.Device().attributes["ComputeMode"] != 0: ## ComputeModeDefault + raise RuntimeError( + "Compute mode must be default in order to use NCCL") + + # get a unique identifier for the NCCL communicator and + # broadcast it to all MPI processes (assuming one device per process) + if self.rank == 0: + self.id = nccl.get_unique_id() + else: + self.id = None + + self.id = parallel.bcast(self.id) + + self.com = nccl.NcclCommunicator(self.ndev, self.id, self.rank) + + def allReduceSum(self, arr): + """Call MPI.all_reduce in-place, with array on GPU""" + + count, datatype = self.__get_NCCL_count_dtype(arr) + + self.com.allReduce(arr.data.ptr, arr.data.ptr, count, datatype, nccl.NCCL_SUM, + cp.cuda.get_current_stream().ptr) + + def __get_NCCL_count_dtype(self, arr): + if arr.dtype == np.complex64: + return arr.size*2, nccl.NCCL_FLOAT32 + elif arr.dtype == np.complex128: + return arr.size*2, nccl.NCCL_FLOAT64 + elif arr.dtype == np.float32: + return arr.size, nccl.NCCL_FLOAT32 + elif arr.dtype == np.float64: + return arr.size, nccl.NCCL_FLOAT64 + else: + raise ValueError("This dtype is not supported by NCCL.") + + +# pick the appropriate communicator depending on installed packages +def get_multi_gpu_communicator(use_nccl=True, use_cuda_mpi=True): + if have_nccl and use_nccl: + try: + comm = 
MultiGpuCommunicatorNccl() + log(4, "Using NCCL communicator") + return comm + except RuntimeError: + pass + except AttributeError: + # see issue #323 + pass + if have_cuda_mpi and use_cuda_mpi: + try: + comm = MultiGpuCommunicatorCudaMpi() + log(4, "Using CUDA-aware MPI communicator") + return comm + except RuntimeError: + pass + comm = MultiGpuCommunicatorMpi() + log(4, "Using MPI communicator") + return comm diff --git a/ptypy/accelerate/cuda_cupy/porting_notes.md b/ptypy/accelerate/cuda_cupy/porting_notes.md new file mode 100644 index 000000000..a10869492 --- /dev/null +++ b/ptypy/accelerate/cuda_cupy/porting_notes.md @@ -0,0 +1,60 @@ +# PyCUDA to CuPy Porting Notes + +This file collects notes for things to consider and issues that were fixed when +porting the pycuda code to cupy. + +## Simple Conversions + +- `gpuarray.to_gpu` => `cp.asarray` +- `gpuarray.zeros`, etc, typically have cupy equivalents in `cp.` +- `gpuarray.get` generally works with `cp.get` as well, but cupy has a more descriptive `cp.asnumpy` as well +- all functions that don't have a direct numpy equivalent are in `cupyx` rather than `cupy` + (for example for pinned arrays) +- raw data pointers to GPU arrays can be retrieved with `x.data.ptr` +- raw data pointers to streams: `stream.ptr` +- low-level APIs, are closer to the standard CUDA runtime calls and are in `cupy.cuda.runtime` module, for example `memcpyAsync` +- streams are not parameters, but rather contexts: + +```python +stream = cp.cuda.Stream() +with stream: + ... # kernel calls etc will go onto this stream + +# alternative: +stream.use() +... 
# next kernel calls will use that stream +``` + + +## Sticky Points + +### Memory Pool + +- cupy uses a device memory pool by default, which re-uses freed memory blocks +- the pool is empty at the start and new allocations use the regular `cudaMalloc` functions +- once blocks are freed, they are not given back to the device with cudaFree, but are rather + kept in a free list and re-used in further allocations +- therefore the flag for using a device memory pool that some engines had made no sense +- this also affects how the total available memory should be calculated - it is in fact the free + device memory + the free memory in the pool + +### Page-locked Memory Pool + +- cupy also uses a `PinnedMemoryPool` for obtaining page-locked blocks +- these will be kept in a free list when they are not required anymore +- it works similarly to the `DeviceMemoryPool` + +### Context Management + +- cupy does not have explicit context creation or deletion of the context +- everything runs in the CUDA runtime's default context (created on first use by default) +- no functions are available to pop the context (as in PyCuda), so one needs to be + careful with cleanup + + +### Kernel Compilation + +- cupy uses NVRTC, which is slightly different from NVCC.
+- the generated device code is not exactly the same for some reason +- Kernels might therefore perform a little bit different - faster or slower, but tests showed + that they are largely equivalent in performance diff --git a/ptypy/accelerate/cuda_pycuda/__init__.py b/ptypy/accelerate/cuda_pycuda/__init__.py index e6c51d49f..3ce6a7a6e 100644 --- a/ptypy/accelerate/cuda_pycuda/__init__.py +++ b/ptypy/accelerate/cuda_pycuda/__init__.py @@ -2,9 +2,8 @@ from pycuda.compiler import SourceModule import numpy as np import os -# debug_options = [] -# debug_options = ['-O0', '-G', '-g'] -debug_options = ['-O3', '-DNDEBUG', '-lineinfo'] # release mode flags +kernel_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', 'cuda_common')) +debug_options = ['-O3', '-DNDEBUG', '-lineinfo', '-I' + kernel_dir] # release mode flags # C++14 support was added with CUDA 9, so we only enable the flag there if cuda.get_version()[0] >= 9: @@ -46,11 +45,11 @@ def load_kernel(name, subs={}, file=None): if file is None: if isinstance(name, str): - fn = "%s/cuda/%s.cu" % (os.path.dirname(__file__), name) + fn = "%s/%s.cu" % (kernel_dir, name) else: raise ValueError("name parameter must be a string if not filename is given") else: - fn = "%s/cuda/%s" % (os.path.dirname(__file__), file) + fn = "%s/%s" % (kernel_dir, file) with open(fn, 'r') as f: kernel = f.read() diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py index 7c2de8f3f..2abd02ba4 100644 --- a/ptypy/accelerate/cuda_pycuda/array_utils.py +++ b/ptypy/accelerate/cuda_pycuda/array_utils.py @@ -1,26 +1,11 @@ +from ptypy.accelerate.cuda_common.utils import map2ctype + from . 
import load_kernel from pycuda import gpuarray import pycuda.driver as cuda from ptypy.utils import gaussian import numpy as np -# maps a numpy dtype to the corresponding C type -def map2ctype(dt): - if dt == np.float32: - return 'float' - elif dt == np.float64: - return 'double' - elif dt == np.complex64: - return 'complex' - elif dt == np.complex128: - return 'complex' - elif dt == np.int32: - return 'int' - elif dt == np.int64: - return 'long long' - else: - raise ValueError('No mapping for {}'.format(dt)) - class ArrayUtilsKernel: def __init__(self, acc_dtype=np.float64, queue=None): diff --git a/pyproject.toml b/pyproject.toml index dcfadf665..635431745 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -68,4 +68,4 @@ ptypy = "ptypy" [tool.setuptools.package-data] ptypy = ["resources/*",] -"ptypy.accelerate.cuda_pycuda.cuda" = ["*.cu"] \ No newline at end of file +"ptypy.accelerate.cuda_common" = ["*.cu", "*.cuh"] \ No newline at end of file diff --git a/templates/accelerate/ptypy_minimal_prep_and_run_cupy.py b/templates/accelerate/ptypy_minimal_prep_and_run_cupy.py new file mode 100644 index 000000000..14c862fb1 --- /dev/null +++ b/templates/accelerate/ptypy_minimal_prep_and_run_cupy.py @@ -0,0 +1,54 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". 
+""" +from ptypy.core import Ptycho +from ptypy import utils as u +import ptypy +ptypy.load_gpu_engines(arch="cupy") + +import tempfile +tmpdir = tempfile.gettempdir() + +p = u.Param() + +# for verbose output +p.verbose_level = "info" +p.frames_per_block = 200 + +# set home path +p.io = u.Param() +p.io.home = "/".join([tmpdir, "ptypy"]) +p.io.autosave = u.Param(active=False) +p.io.autoplot = u.Param(active=False) +p.io.interaction = u.Param(active=False) + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. +p.scans.MF.name = 'BlockFull' +p.scans.MF.data= u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 200 +p.scans.MF.data.save = None + +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photon in empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0. + +# attach a reconstrucion engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DM_cupy' +p.engines.engine00.numiter = 80 + +# prepare and run +if __name__ == "__main__": + P = Ptycho(p,level=5) diff --git a/test/accelerate_tests/cuda_cupy_tests/__init__.py b/test/accelerate_tests/cuda_cupy_tests/__init__.py new file mode 100644 index 000000000..7df79ac0f --- /dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/__init__.py @@ -0,0 +1,33 @@ +import unittest +import numpy as np +import importlib + +# shall we run performance tests? 
+perfrun = False + +def have_cupy(): + if importlib.util.find_spec('cupy') is None: + return False + try: + import cupy as cp + cp.cuda.Device(0).compute_capability + return True + except cp.cuda.runtime.CUDARuntimeError: + return False + +if have_cupy(): + import cupy as cp + +@unittest.skipIf(not have_cupy(), "no cupy available") +class CupyCudaTest(unittest.TestCase): + + def setUp(self): + import sys + np.set_printoptions(threshold=sys.maxsize, linewidth=np.inf) + self.stream = cp.cuda.Stream() + self.stream.use() + + def tearDown(self): + np.set_printoptions() + # back to default stream + cp.cuda.Stream.null.use() \ No newline at end of file diff --git a/test/accelerate_tests/cuda_cupy_tests/address_manglers_test.py b/test/accelerate_tests/cuda_cupy_tests/address_manglers_test.py new file mode 100644 index 000000000..c59fb852d --- /dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/address_manglers_test.py @@ -0,0 +1,77 @@ +import unittest +import numpy as np +from . import perfrun, CupyCudaTest, have_cupy +from ptypy.accelerate.base import address_manglers as am + +if have_cupy(): + import cupy as cp + from ptypy.accelerate.cuda_cupy import address_manglers as gam + + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class AddressManglersTest(CupyCudaTest): + + def prepare_addresses(self, max_bound=10, scan_pts=2, num_modes=3): + total_number_scan_positions = scan_pts ** 2 + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + max_bound # max bound is added in the DM_serial engine. 
+ Y = Y.reshape((total_number_scan_positions)) + max_bound + + addr_original = np.zeros((total_number_scan_positions, num_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): # + mode_idx = 0 + for pr_mode in range(num_modes): + for ob_mode in range(1): + addr_original[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + return addr_original + + def test_get_address_REGRESSION(self): + # the other manglers are using the BaseMangler's get_address function + # so we set the deltas in a BaseMangler object and test get_address + + scan_pts=2 + addr_original = self.prepare_addresses(scan_pts=scan_pts) + addr_original_dev = cp.asarray(addr_original) + nshifts=1 + step_size=2 + mglr = gam.BaseMangler(step_size, 50, 100, nshifts, max_bound=2) + # 2 shifts, with positive/negative shifting + mglr.delta = np.array([ + [1, 2], + [-4, -2] + ], dtype=np.int32) + mglr._setup_delta_gpu() + + addr1 = addr_original_dev.copy() + mglr.get_address(0, addr_original_dev, addr1, 10, 9) + + addr2 = addr_original_dev.copy() + mglr.get_address(1, addr_original_dev, addr2, 10, 9) + + exp1 = np.copy(addr_original) + exp2 = np.copy(addr_original) + # element-wise here to prepare reference + for f in range(addr_original.shape[0]): + for m in range(addr_original.shape[1]): + exp1[f, m, 1, 1] = max(0, min(10, addr_original[f, m, 1, 1] + 1)) + exp1[f, m, 1, 2] = max(0, min(9, addr_original[f, m, 1, 2] + 2)) + exp2[f, m, 1, 1] = max(0, min(10, addr_original[f, m, 1, 1] - 4)) + exp2[f, m, 1, 2] = max(0, min(9, addr_original[f, m, 1, 2] - 2)) + + np.testing.assert_array_equal(addr2.get(), exp2) + np.testing.assert_array_equal(addr1.get(), exp1) + diff --git a/test/accelerate_tests/cuda_cupy_tests/array_utils_test.py b/test/accelerate_tests/cuda_cupy_tests/array_utils_test.py new file mode 100644 index 
000000000..0c018b205 --- /dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/array_utils_test.py @@ -0,0 +1,536 @@ +import unittest +import numpy as np +from . import perfrun, CupyCudaTest, have_cupy +from ptypy.accelerate.base import array_utils as au + +if have_cupy(): + import cupy as cp + import ptypy.accelerate.cuda_cupy.array_utils as gau + + +class ArrayUtilsTest(CupyCudaTest): + + def test_dot_float_float(self): + # Arrange + X, Y, Z = np.indices((3, 3, 1001), dtype=np.float32) + A = 10 ** Y + A_dev = cp.asarray(A) + + # Act + AU = gau.ArrayUtilsKernel(acc_dtype=np.float32) + out_dev = AU.dot(A_dev, A_dev) + out = cp.asnumpy(out_dev) + + # Assert + np.testing.assert_allclose(out, 30333303.0, rtol=1e-7) + + def test_dot_float_double(self): + # Arrange + X, Y, Z = np.indices((3, 3, 1001), dtype=np.float32) + A = 10 ** Y + A_dev = cp.asarray(A) + + # Act + AU = gau.ArrayUtilsKernel(acc_dtype=np.float64) + out_dev = AU.dot(A_dev, A_dev) + out = cp.asnumpy(out_dev) + + # Assert + np.testing.assert_equal(out, 30333303.0) + + def test_dot_complex_float(self): + # Arrange + X, Y, Z = np.indices((3, 3, 1001), dtype=np.float32) + A = 10 ** Y + 1j * 10 ** X + A = A.astype(np.complex64) + A_dev = cp.asarray(A) + + # Act + AU = gau.ArrayUtilsKernel(acc_dtype=np.float32) + out_dev = AU.dot(A_dev, A_dev) + out = cp.asnumpy(out_dev) + + # Assert + np.testing.assert_allclose(out, 60666606.0, rtol=1e-7) + + def test_dot_complex_double(self): + # Arrange + X, Y, Z = np.indices((3, 3, 1001), dtype=np.float32) + A = 10 ** Y + 1j * 10 ** X + A_dev = cp.asarray(A) + + # Act + AU = gau.ArrayUtilsKernel(acc_dtype=np.float64) + out_dev = AU.dot(A_dev, A_dev) + out = cp.asnumpy(out_dev) + + # Assert + np.testing.assert_array_equal(out, 60666606.0) + + @unittest.skipIf(not perfrun, "Performance test") + def test_dot_performance(self): + # Arrange + X, Y, Z = np.indices((3, 3, 1021301), dtype=np.float32) + A = 10 ** Y + 1j * 10 ** X + A_dev = cp.asarray(A) + + # Act + AU = 
gau.ArrayUtilsKernel(acc_dtype=np.float64) + AU.dot(A_dev, A_dev) + + def test_transpose_2D(self): + # Arrange + inp, _ = np.indices((5, 3), dtype=np.int32) + inp_dev = cp.asarray(inp) + out_dev = cp.empty((3, 5), dtype=np.int32) + + # Act + AU = gau.TransposeKernel() + AU.transpose(inp_dev, out_dev) + + # Assert + out_exp = np.transpose(inp, (1, 0)) + out = cp.asnumpy(out_dev) + np.testing.assert_array_equal(out, out_exp) + + def test_transpose_2D_large(self): + # Arrange + inp, _ = np.indices((137, 61), dtype=np.int32) + inp_dev = cp.asarray(inp) + out_dev = cp.empty((61, 137), dtype=np.int32) + + # Act + AU = gau.TransposeKernel() + AU.transpose(inp_dev, out_dev) + + # Assert + out_exp = np.transpose(inp, (1, 0)) + out = cp.asnumpy(out_dev) + np.testing.assert_array_equal(out, out_exp) + + def test_transpose_4D(self): + # Arrange + inp = np.random.randint(0, 10000, (250, 3, 5, 3), + dtype=np.int32) # like addr + inp_dev = cp.asarray(inp) + out_dev = cp.empty((5, 3, 250, 3), dtype=np.int32) + + # Act + AU = gau.TransposeKernel() + AU.transpose(inp_dev.reshape(750, 15), out_dev.reshape(15, 750)) + + # Assert + out_exp = np.transpose(inp, (2, 3, 0, 1)) + out = cp.asnumpy(out_dev) + np.testing.assert_array_equal(out, out_exp) + + def test_complex_gaussian_filter_1d_no_blurring_UNITY(self): + # Arrange + data = np.zeros((11,), dtype=np.complex64) + data[5] = 1.0 +1.0j + mfs = [0] + data_dev = cp.asarray(data) + tmp_dev = cp.empty((11,), dtype=np.complex64) + + # Act + GS = gau.GaussianSmoothingKernel() + GS.convolution(data_dev, mfs, tmp=tmp_dev) + + # Assert + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() + self.assertTrue(np.testing.assert_allclose(out_exp, out, rtol=1e-5) is None) + + def test_complex_gaussian_filter_1d_little_blurring_UNITY(self): + # Arrange + data = np.zeros((11,), dtype=np.complex64) + data[5] = 1.0 +1.0j + mfs = [0.2] + data_dev = cp.asarray(data) + tmp_dev = cp.empty((11,), dtype=np.complex64) + + # Act + GS = 
gau.GaussianSmoothingKernel() + GS.convolution(data_dev, mfs, tmp=tmp_dev) + + # Assert + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() + np.testing.assert_allclose(out_exp, out, rtol=1e-5) + + + def test_complex_gaussian_filter_1d_more_blurring_UNITY(self): + # Arrange + data = np.zeros((11,), dtype=np.complex64) + data[5] = 1.0 +1.0j + mfs = [2.0] + data_dev = cp.asarray(data) + tmp_dev = cp.empty((11,), dtype=np.complex64) + + # Act + GS = gau.GaussianSmoothingKernel() + GS.convolution(data_dev, mfs, tmp=tmp_dev) + + # Assert + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() + np.testing.assert_allclose(out_exp, out, rtol=1e-5) + + def test_complex_gaussian_filter_2d_no_blurring_UNITY(self): + # Arrange + data = np.zeros((11, 11), dtype=np.complex64) + data[5, 5] = 1.0+1.0j + mfs = 0.0,0.0 + data_dev = cp.asarray(data) + tmp_dev = cp.empty((11,11), dtype=np.complex64) + + # Act + GS = gau.GaussianSmoothingKernel() + GS.convolution(data_dev, mfs, tmp=tmp_dev) + + # Assert + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() + np.testing.assert_allclose(out_exp, out, rtol=1e-5) + + def test_complex_gaussian_filter_2d_little_blurring_UNITY(self): + # Arrange + data = np.zeros((11, 11), dtype=np.complex64) + data[5, 5] = 1.0+1.0j + mfs = 0.2,0.2 + data_dev = cp.asarray(data) + tmp_dev = cp.empty((11,11),dtype=np.complex64) + + # Act + GS = gau.GaussianSmoothingKernel() + GS.convolution(data_dev, mfs, tmp=tmp_dev) + + # Assert + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() + np.testing.assert_allclose(out_exp, out, rtol=1e-5) + + def test_complex_gaussian_filter_2d_more_blurring_UNITY(self): + # Arrange + data = np.zeros((8, 8), dtype=np.complex64) + data[3:5, 3:5] = 2.0+2.0j + mfs = 3.0,4.0 + data_dev = cp.asarray(data) + #tmp_dev = cp.empty((8,8), dtype=np.complex64) + + # Act + GS = gau.GaussianSmoothingKernel() + GS.convolution(data_dev, mfs) + + # Assert + out_exp = 
au.complex_gaussian_filter(data, mfs) + out = data_dev.get() + np.testing.assert_allclose(out_exp, out, rtol=1e-4) + + def test_complex_gaussian_filter_2d_nonsquare_UNITY(self): + # Arrange + data = np.zeros((32, 16), dtype=np.complex64) + data[3:4, 11:12] = 2.0+2.0j + data[3:5, 3:5] = 2.0+2.0j + data[20:25,3:5] = 2.0+2.0j + mfs = 1.0,1.0 + data_dev = cp.asarray(data) + tmp_dev = cp.empty(data_dev.shape, dtype=np.complex64) + + # Act + GS = gau.GaussianSmoothingKernel() + GS.convolution(data_dev, mfs, tmp=tmp_dev) + + # Assert + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() + + np.testing.assert_allclose(out_exp, out, rtol=1e-4) + + def test_complex_gaussian_filter_2d_batched(self): + # Arrange + batch_number = 2 + A = 5 + B = 5 + data = np.zeros((batch_number, A, B), dtype=np.complex64) + data[:, 2:3, 2:3] = 2.0+2.0j + mfs = 3.0,4.0 + data_dev = cp.asarray(data) + tmp_dev = cp.empty((batch_number,A,B), dtype=np.complex64) + + # Act + GS = gau.GaussianSmoothingKernel() + GS.convolution(data_dev, mfs, tmp=tmp_dev) + + # Assert + out_exp = au.complex_gaussian_filter(data, mfs) + out = data_dev.get() + np.testing.assert_allclose(out_exp, out, rtol=1e-4) + + + + def test_crop_pad_simple_1_UNITY(self): + # pad, integer, 2D + B = np.indices((4, 4), dtype=np.int32).sum(0) + A = np.zeros((6, 6), dtype=B.dtype) + B_dev = cp.asarray(B) + A_dev = cp.asarray(A) + + # Act + au.crop_pad_2d_simple(A, B) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, B_dev) + + # Assert + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + + def test_crop_pad_simple_2_UNITY(self): + # crop, float, 3D + B = np.indices((4, 4), dtype=np.float32) + A = np.zeros((2, 2, 2), dtype=B.dtype) + B_dev = cp.asarray(B) + A_dev = cp.asarray(A) + + # Act + au.crop_pad_2d_simple(A, B) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, B_dev) + + # Assert + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + + 
def test_crop_pad_simple_3_UNITY(self): + # crop/pad, complex, 3D + B = np.indices((4, 3), dtype=np.complex64) + B = np.indices((4, 3), dtype=np.complex64) + 1j * B[::-1, :, :] + A = np.zeros((2, 2, 5), dtype=B.dtype) + B_dev = cp.asarray(B) + A_dev = cp.asarray(A) + + # Act + au.crop_pad_2d_simple(A, B) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, B_dev) + + # Assert + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + + def test_crop_pad_simple_difflike_UNITY(self): + np.random.seed(1983) + # crop/pad, 4D + D = np.random.randint(0, 3000, (100, 256, 256)).astype(np.float32) + A = np.zeros((100, 260, 260), dtype=D.dtype) + B = np.zeros((100, 250, 250), dtype=D.dtype) + B_dev = cp.asarray(B) + A_dev = cp.asarray(A) + D_dev = cp.asarray(D) + + # Act + au.crop_pad_2d_simple(A, D) + au.crop_pad_2d_simple(B, D) + k = gau.CropPadKernel(queue=self.stream) + k.crop_pad_2d_simple(A_dev, D_dev) + k.crop_pad_2d_simple(B_dev, D_dev) + + # Assert + np.testing.assert_allclose(A, A_dev.get(), rtol=1e-6, atol=1e-6) + np.testing.assert_allclose(B, B_dev.get(), rtol=1e-6, atol=1e-6) + + def test_crop_pad_simple_oblike_UNITY(self): + np.random.seed(1983) + X = np.random.randint(-1000, 1000, (3, 100, 200)).astype(np.float32) + + out = np.zeros((1,), dtype=np.float32) + X_dev = cp.asarray(X) + out_dev = cp.asarray(out) + + out = au.max_abs2(X) + + MAK = gau.MaxAbs2Kernel(queue=self.stream) + MAK.max_abs2(X_dev, out_dev) + + np.testing.assert_allclose(out_dev.get(), out, rtol=1e-6, atol=1e-6, + err_msg="The object norm array has not been updated as expected") + + def test_max_abs2_complex_UNITY(self): + np.random.seed(1983) + X = (np.random.randint(-1000, 1000, (3, 100, 200)).astype(np.float32) + + 1j * np.random.randint(-1000, 1000, (3, 100, 200)).astype(np.float32)).astype(np.complex64) + out = np.zeros((1,), dtype=np.float32) + X_dev = cp.asarray(X) + out_dev = cp.asarray(out) + + out = au.max_abs2(X) + + MAK = 
gau.MaxAbs2Kernel(queue=self.stream) + MAK.max_abs2(X_dev, out_dev) + + np.testing.assert_allclose(out_dev.get(), out, rtol=1e-6, atol=1e-6, + err_msg="The object norm array has not been updated as expected") + + def test_max_abs2_float_UNITY(self): + np.random.seed(1983) + X = np.random.randint(-1000, 1000, (3, 100, 200)).astype(np.float32) + + out = np.zeros((1,), dtype=np.float32) + X_dev = cp.asarray(X) + out_dev = cp.asarray(out) + + out = au.max_abs2(X) + + MAK = gau.MaxAbs2Kernel(queue=self.stream) + MAK.max_abs2(X_dev, out_dev) + + np.testing.assert_allclose(out_dev.get(), out, rtol=1e-6, atol=1e-6, + err_msg="The object norm array has not been updated as expected") + + + def test_clip_magnitudes_to_range_UNITY(self): + np.random.seed(1987) + A = np.random.random((2,10,10)) + B = A[0] + 1j* A[1] + B = B.astype(np.complex64) + B_gpu = cp.asarray(B) + + au.clip_complex_magnitudes_to_range(B, 0.2,0.8) + CMK = gau.ClipMagnitudesKernel() + CMK.clip_magnitudes_to_range(B_gpu, 0.2, 0.8) + + np.testing.assert_allclose(B_gpu.get(), B, rtol=1e-6, atol=1e-6, + err_msg="The magnitudes of the array have not been clipped as expected") + + def test_mass_center_2d_UNITY(self): + np.random.seed(1987) + A = np.random.random((128, 128)).astype(np.float32) + A_gpu = cp.asarray(A) + + out = au.mass_center(A) + + MCK = gau.MassCenterKernel() + mc_d = MCK.mass_center(A_gpu) + mc = mc_d.get() + + np.testing.assert_allclose(out, mc, rtol=1e-6, atol=1e-6, + err_msg="The centre of mass of the array has not been calculated as expected") + + + def test_mass_center_3d_UNITY(self): + np.random.seed(1987) + A = np.random.random((128, 128, 128)).astype(np.float32) + A_gpu = cp.asarray(A) + + out = au.mass_center(A) + + MCK = gau.MassCenterKernel() + mc_d = MCK.mass_center(A_gpu) + mc = mc_d.get() + + np.testing.assert_allclose(out, mc, rtol=1e-6, atol=1e-6, + err_msg="The centre of mass of the array has not been calculated as expected") + + def test_abs2sum_complex_float_UNITY(self): + 
np.random.seed(1987) + A = np.random.random((3, 321, 123)).astype(np.float32) + B = A + A**2 * 1j + B_gpu = cp.asarray(B) + + out = au.abs2(B).sum(0) + + A2SK = gau.Abs2SumKernel(dtype=B_gpu.dtype) + a2s_d = A2SK.abs2sum(B_gpu) + a2s = a2s_d.get() + + np.testing.assert_allclose(out, a2s, rtol=1e-6, atol=1e-6, + err_msg="The sum of absolute values along the first dimension has not been calculated as expected") + + def test_abs2sum_complex_double_UNITY(self): + np.random.seed(1987) + A = np.random.random((3, 321, 123)).astype(np.float64) + B = A + A**2 * 1j + B_gpu = cp.asarray(B) + + out = au.abs2(B).sum(0) + + A2SK = gau.Abs2SumKernel(dtype=B_gpu.dtype) + a2s_d = A2SK.abs2sum(B_gpu) + a2s = a2s_d.get() + + np.testing.assert_allclose(out, a2s, rtol=1e-6, atol=1e-6, + err_msg="The sum of absolute values along the first dimension has not been calculated as expected") + + def test_interpolate_shift_2D_UNITY(self): + np.random.seed(1987) + A = np.random.random((259, 252)).astype(np.float32) + A = A + A**2 * 1j + A_gpu = cp.asarray(A) + + cen_old = np.array([100.123, 5.678]).astype(np.float32) + cen_new = np.array([128.5, 127.5]).astype(np.float32) + shift = cen_new - cen_old + + out = au.interpolated_shift(A, shift, do_linear=True) + + ISK = gau.InterpolatedShiftKernel() + isk_d = ISK.interpolate_shift(A_gpu, shift) + isk = isk_d.get() + + np.testing.assert_allclose(out, isk, rtol=1e-6, atol=1e-6, + err_msg="The shifting of array has not been calculated as expected") + + def test_interpolate_shift_3D_UNITY(self): + np.random.seed(1987) + A = np.random.random((3, 200, 300)).astype(np.float32) + A = A + A**2 * 1j + A_gpu = cp.asarray(A) + + cen_old = np.array([0., 180.123, 5.678]).astype(np.float32) + cen_new = np.array([0., 128.5, 127.5]).astype(np.float32) + shift = cen_new - cen_old + + out = au.interpolated_shift(A, shift, do_linear=True) + + ISK = gau.InterpolatedShiftKernel() + isk_d = ISK.interpolate_shift(A_gpu, shift[1:]) + isk = isk_d.get() + + 
np.testing.assert_allclose(out, isk, rtol=1e-6, atol=1e-6, + err_msg="The shifting of array has not been calculated as expected") + + def test_interpolate_shift_integer_UNITY(self): + np.random.seed(1987) + A = np.random.random((3, 200, 300)).astype(np.float32) + A = A + A**2 * 1j + A_gpu = cp.asarray(A) + + cen_old = np.array([0, 180, 5]).astype(np.float32) + cen_new = np.array([0, 128, 127]).astype(np.float32) + shift = cen_new - cen_old + + out = au.interpolated_shift(A, shift, do_linear=True) + + ISK = gau.InterpolatedShiftKernel() + isk_d = ISK.interpolate_shift(A_gpu, shift[1:]) + isk = isk_d.get() + + np.testing.assert_allclose(out, isk, rtol=1e-6, atol=1e-6, + err_msg="The shifting of array has not been calculated as expected") + + def test_interpolate_shift_no_shift_UNITY(self): + np.random.seed(1987) + A = np.random.random((3, 200, 300)).astype(np.float32) + A = A + A**2 * 1j + A_gpu = cp.asarray(A) + + cen_old = np.array([0, 0, 0]).astype(np.float32) + cen_new = np.array([0, 0, 0]).astype(np.float32) + shift = cen_new - cen_old + + out = au.interpolated_shift(A, shift, do_linear=True) + + ISK = gau.InterpolatedShiftKernel() + isk_d = ISK.interpolate_shift(A_gpu, shift[1:]) + isk = isk_d.get() + + np.testing.assert_allclose(out, isk, rtol=1e-6, atol=1e-6, + err_msg="The shifting of array has not been calculated as expected") + diff --git a/test/accelerate_tests/cuda_cupy_tests/auxiliary_wave_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/auxiliary_wave_kernel_test.py new file mode 100644 index 000000000..df27077e2 --- /dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/auxiliary_wave_kernel_test.py @@ -0,0 +1,666 @@ +''' + + +''' + +import unittest +import numpy as np +from . 
import perfrun, CupyCudaTest, have_cupy + +if have_cupy(): + import cupy as cp + from ptypy.accelerate.cuda_cupy.kernels import AuxiliaryWaveKernel + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class AuxiliaryWaveKernelTest(CupyCudaTest): + + def prepare_arrays(self, performance=False, scan_points=None): + if not performance: + B = 3 # frame size y + C = 3 # frame size x + D = 2 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. + G = 2 # number of object modes + if scan_points is None: + scan_pts = 2 # one dimensional scan point number + else: + scan_pts = scan_points + else: + B = 128 + C = 128 + D = 2 + E = B + F = C + npts_greater_than = 1215 + G = 4 + if scan_points is None: + scan_pts = 14 + else: + scan_pts = scan_points + + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + + exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): # + mode_idx = 0 + for 
pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + if performance: + print('addr={}, obj={}, pr={}, ex={}'.format(addr.shape, object_array.shape, probe.shape, exit_wave.shape)) + # assert False + + return addr, object_array, probe, exit_wave + + def copy_to_gpu(self, addr, object_array, probe, exit_wave): + return (cp.asarray(addr), + cp.asarray(object_array), + cp.asarray(probe), + cp.asarray(exit_wave)) + + def test_init(self): + # should we really test for private attributes? + # Only the public interface should be checked - what clients rely on + attrs = ["_ob_shape", + "_ob_id"] + + AWK = AuxiliaryWaveKernel(self.stream) + for attr in attrs: + self.assertTrue(hasattr(AWK, attr), msg="AuxiliaryWaveKernel does not have attribute: %s" % attr) + + np.testing.assert_equal(AWK.kernels, + ['build_aux', 'build_exit'], + err_msg='AuxiliaryWaveKernel does not have the correct functions registered.') + + def test_build_aux_same_as_exit_REGRESSION(self): + ## Arrange + cpudata = self.prepare_arrays() + addr, object_array, probe, exit_wave = self.copy_to_gpu(*cpudata) + auxiliary_wave = cp.zeros_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + alpha_set = FLOAT_TYPE(1.0) + + AWK.build_aux(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=alpha_set) + + expected_auxiliary_wave = np.array([[[-1. + 3.j, -1. + 3.j, -1. + 3.j], + [-1. + 3.j, -1. + 3.j, -1. + 3.j], + [-1. + 3.j, -1. + 3.j, -1. + 3.j]], + [[-2.+14.j, -2.+14.j, -2.+14.j], + [-2.+14.j, -2.+14.j, -2.+14.j], + [-2.+14.j, -2.+14.j, -2.+14.j]], + [[-3. + 5.j, -3. + 5.j, -3. + 5.j], + [-3. + 5.j, -3. + 5.j, -3. + 5.j], + [-3. + 5.j, -3. + 5.j, -3. + 5.j]], + [[-4.+28.j, -4.+28.j, -4.+28.j], + [-4.+28.j, -4.+28.j, -4.+28.j], + [-4.+28.j, -4.+28.j, -4.+28.j]], + [[-5. - 1.j, -5. - 1.j, -5. 
- 1.j], + [-5. - 1.j, -5. - 1.j, -5. - 1.j], + [-5. - 1.j, -5. - 1.j, -5. - 1.j]], + [[-6.+10.j, -6.+10.j, -6.+10.j], + [-6.+10.j, -6.+10.j, -6.+10.j], + [-6.+10.j, -6.+10.j, -6.+10.j]], + [[-7. + 1.j, -7. + 1.j, -7. + 1.j], + [-7. + 1.j, -7. + 1.j, -7. + 1.j], + [-7. + 1.j, -7. + 1.j, -7. + 1.j]], + [[-8.+24.j, -8.+24.j, -8.+24.j], + [-8.+24.j, -8.+24.j, -8.+24.j], + [-8.+24.j, -8.+24.j, -8.+24.j]], + [[-9. - 5.j, -9. - 5.j, -9. - 5.j], + [-9. - 5.j, -9. - 5.j, -9. - 5.j], + [-9. - 5.j, -9. - 5.j, -9. - 5.j]], + [[-10. + 6.j, -10. + 6.j, -10. + 6.j], + [-10. + 6.j, -10. + 6.j, -10. + 6.j], + [-10. + 6.j, -10. + 6.j, -10. + 6.j]], + [[-11. - 3.j, -11. - 3.j, -11. - 3.j], + [-11. - 3.j, -11. - 3.j, -11. - 3.j], + [-11. - 3.j, -11. - 3.j, -11. - 3.j]], + [[-12.+20.j, -12.+20.j, -12.+20.j], + [-12.+20.j, -12.+20.j, -12.+20.j], + [-12.+20.j, -12.+20.j, -12.+20.j]], + [[-13. - 9.j, -13. - 9.j, -13. - 9.j], + [-13. - 9.j, -13. - 9.j, -13. - 9.j], + [-13. - 9.j, -13. - 9.j, -13. - 9.j]], + [[-14. + 2.j, -14. + 2.j, -14. + 2.j], + [-14. + 2.j, -14. + 2.j, -14. + 2.j], + [-14. + 2.j, -14. + 2.j, -14. + 2.j]], + [[-15. - 7.j, -15. - 7.j, -15. - 7.j], + [-15. - 7.j, -15. - 7.j, -15. - 7.j], + [-15. - 7.j, -15. - 7.j, -15. 
- 7.j]], + [[-16.+16.j, -16.+16.j, -16.+16.j], + [-16.+16.j, -16.+16.j, -16.+16.j], + [-16.+16.j, -16.+16.j, -16.+16.j]]], dtype=COMPLEX_TYPE) + + np.testing.assert_array_equal(expected_auxiliary_wave, auxiliary_wave.get(), + err_msg="The auxiliary_wave has not been updated as expected") + + + def test_build_aux_same_as_exit_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = np.zeros_like(exit_wave) + auxiliary_wave_dev = cp.zeros_like(exit_wave_dev) + + ## Act + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + AWK = AuxiliaryWaveKernel(self.stream) + alpha_set = FLOAT_TYPE(.75) + + AWK.build_aux(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, exit_wave_dev, alpha=alpha_set) + nAWK.build_aux(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=alpha_set) + + ## Assert + np.testing.assert_array_equal(auxiliary_wave, auxiliary_wave_dev.get(), + err_msg="The gpu auxiliary_wave does not look the same as the numpy version") + + def test_build_aux2_same_as_exit_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = np.zeros_like(exit_wave) + auxiliary_wave_dev = cp.zeros_like(exit_wave_dev) + + ## Act + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + AWK = AuxiliaryWaveKernel(self.stream) + alpha_set = FLOAT_TYPE(.75) + + AWK.build_aux2(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, exit_wave_dev, alpha=alpha_set) + nAWK.build_aux(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=alpha_set) + + ## Assert + np.testing.assert_array_equal(auxiliary_wave, 
auxiliary_wave_dev.get(), + err_msg="The gpu auxiliary_wave does not look the same as the numpy version") + + def test_build_exit_aux_same_as_exit_REGRESSION(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave_dev = cp.zeros_like(exit_wave_dev) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + alpha_set = 1.0 + AWK.build_exit(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, exit_wave_dev) + + ## Assert + expected_auxiliary_wave = np.array([[[0. - 2.j, 0. - 2.j, 0. - 2.j], + [0. - 2.j, 0. - 2.j, 0. - 2.j], + [0. - 2.j, 0. - 2.j, 0. - 2.j]], + [[0. - 8.j, 0. - 8.j, 0. - 8.j], + [0. - 8.j, 0. - 8.j, 0. - 8.j], + [0. - 8.j, 0. - 8.j, 0. - 8.j]], + [[0. - 4.j, 0. - 4.j, 0. - 4.j], + [0. - 4.j, 0. - 4.j, 0. - 4.j], + [0. - 4.j, 0. - 4.j, 0. - 4.j]], + [[0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j]], + [[0. - 2.j, 0. - 2.j, 0. - 2.j], + [0. - 2.j, 0. - 2.j, 0. - 2.j], + [0. - 2.j, 0. - 2.j, 0. - 2.j]], + [[0. - 8.j, 0. - 8.j, 0. - 8.j], + [0. - 8.j, 0. - 8.j, 0. - 8.j], + [0. - 8.j, 0. - 8.j, 0. - 8.j]], + [[0. - 4.j, 0. - 4.j, 0. - 4.j], + [0. - 4.j, 0. - 4.j, 0. - 4.j], + [0. - 4.j, 0. - 4.j, 0. - 4.j]], + [[0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j]], + [[0. - 2.j, 0. - 2.j, 0. - 2.j], + [0. - 2.j, 0. - 2.j, 0. - 2.j], + [0. - 2.j, 0. - 2.j, 0. - 2.j]], + [[0. - 8.j, 0. - 8.j, 0. - 8.j], + [0. - 8.j, 0. - 8.j, 0. - 8.j], + [0. - 8.j, 0. - 8.j, 0. - 8.j]], + [[0. - 4.j, 0. - 4.j, 0. - 4.j], + [0. - 4.j, 0. - 4.j, 0. - 4.j], + [0. - 4.j, 0. - 4.j, 0. - 4.j]], + [[0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j]], + [[0. - 2.j, 0. - 2.j, 0. - 2.j], + [0. - 2.j, 0. - 2.j, 0. - 2.j], + [0. - 2.j, 0. - 2.j, 0. - 2.j]], + [[0. - 8.j, 0. - 8.j, 0. - 8.j], + [0. - 8.j, 0. 
- 8.j, 0. - 8.j], + [0. - 8.j, 0. - 8.j, 0. - 8.j]], + [[0. - 4.j, 0. - 4.j, 0. - 4.j], + [0. - 4.j, 0. - 4.j, 0. - 4.j], + [0. - 4.j, 0. - 4.j, 0. - 4.j]], + [[0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j]]], dtype=COMPLEX_TYPE) + + np.testing.assert_array_equal(expected_auxiliary_wave, auxiliary_wave_dev.get(), + err_msg="The auxiliary_wave has not been updated as expected") + + expected_exit_wave = np.array([[[1. - 1.j, 1. - 1.j, 1. - 1.j], + [1. - 1.j, 1. - 1.j, 1. - 1.j], + [1. - 1.j, 1. - 1.j, 1. - 1.j]], + [[2. - 6.j, 2. - 6.j, 2. - 6.j], + [2. - 6.j, 2. - 6.j, 2. - 6.j], + [2. - 6.j, 2. - 6.j, 2. - 6.j]], + [[3. - 1.j, 3. - 1.j, 3. - 1.j], + [3. - 1.j, 3. - 1.j, 3. - 1.j], + [3. - 1.j, 3. - 1.j, 3. - 1.j]], + [[4. - 12.j, 4. - 12.j, 4. - 12.j], + [4. - 12.j, 4. - 12.j, 4. - 12.j], + [4. - 12.j, 4. - 12.j, 4. - 12.j]], + [[5. + 3.j, 5. + 3.j, 5. + 3.j], + [5. + 3.j, 5. + 3.j, 5. + 3.j], + [5. + 3.j, 5. + 3.j, 5. + 3.j]], + [[6. - 2.j, 6. - 2.j, 6. - 2.j], + [6. - 2.j, 6. - 2.j, 6. - 2.j], + [6. - 2.j, 6. - 2.j, 6. - 2.j]], + [[7. + 3.j, 7. + 3.j, 7. + 3.j], + [7. + 3.j, 7. + 3.j, 7. + 3.j], + [7. + 3.j, 7. + 3.j, 7. + 3.j]], + [[8. - 8.j, 8. - 8.j, 8. - 8.j], + [8. - 8.j, 8. - 8.j, 8. - 8.j], + [8. - 8.j, 8. - 8.j, 8. - 8.j]], + [[9. + 7.j, 9. + 7.j, 9. + 7.j], + [9. + 7.j, 9. + 7.j, 9. + 7.j], + [9. + 7.j, 9. + 7.j, 9. + 7.j]], + [[10. + 2.j, 10. + 2.j, 10. + 2.j], + [10. + 2.j, 10. + 2.j, 10. + 2.j], + [10. + 2.j, 10. + 2.j, 10. + 2.j]], + [[11. + 7.j, 11. + 7.j, 11. + 7.j], + [11. + 7.j, 11. + 7.j, 11. + 7.j], + [11. + 7.j, 11. + 7.j, 11. + 7.j]], + [[12. - 4.j, 12. - 4.j, 12. - 4.j], + [12. - 4.j, 12. - 4.j, 12. - 4.j], + [12. - 4.j, 12. - 4.j, 12. - 4.j]], + [[13. + 11.j, 13. + 11.j, 13. + 11.j], + [13. + 11.j, 13. + 11.j, 13. + 11.j], + [13. + 11.j, 13. + 11.j, 13. + 11.j]], + [[14. + 6.j, 14. + 6.j, 14. + 6.j], + [14. + 6.j, 14. + 6.j, 14. + 6.j], + [14. + 6.j, 14. + 6.j, 14. + 6.j]], + [[15. + 11.j, 15. 
+ 11.j, 15. + 11.j], + [15. + 11.j, 15. + 11.j, 15. + 11.j], + [15. + 11.j, 15. + 11.j, 15. + 11.j]], + [[16. + 0.j, 16. + 0.j, 16. + 0.j], + [16. + 0.j, 16. + 0.j, 16. + 0.j], + [16. + 0.j, 16. + 0.j, 16. + 0.j]]], dtype=COMPLEX_TYPE) + + np.testing.assert_array_equal(expected_exit_wave, exit_wave_dev.get(), + err_msg="The exit_wave has not been updated as expected") + + def test_build_exit_aux_same_as_exit_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = np.zeros_like(exit_wave) + auxiliary_wave_dev = cp.zeros_like(exit_wave_dev) + + ## Act + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + AWK = AuxiliaryWaveKernel(self.stream) + + AWK.build_exit(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, exit_wave_dev) + nAWK.build_exit(auxiliary_wave, addr, object_array, probe, exit_wave) + + ## Assert + np.testing.assert_array_equal(auxiliary_wave, auxiliary_wave_dev.get(), + err_msg="The gpu auxiliary_wave does not look the same as the numpy version") + + np.testing.assert_array_equal(exit_wave, exit_wave_dev.get(), + err_msg="The gpu exit_wave does not look the same as the numpy version") + + def test_build_aux_no_ex_noadd_REGRESSION(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr, object_array, probe, exit_wave = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = cp.zeros_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, + fac=1.0, add=False) + + ## Assert + expected_auxiliary_wave = np.array([[[0. + 2.j, 0. + 2.j, 0. + 2.j], + [0. + 2.j, 0. + 2.j, 0. + 2.j], + [0. + 2.j, 0. + 2.j, 0. + 2.j]], + [[0. + 8.j, 0. + 8.j, 0. + 8.j], + [0. + 8.j, 0. + 8.j, 0. 
+ 8.j], + [0. + 8.j, 0. + 8.j, 0. + 8.j]], + [[0. + 4.j, 0. + 4.j, 0. + 4.j], + [0. + 4.j, 0. + 4.j, 0. + 4.j], + [0. + 4.j, 0. + 4.j, 0. + 4.j]], + [[0. + 16.j, 0. + 16.j, 0. + 16.j], + [0. + 16.j, 0. + 16.j, 0. + 16.j], + [0. + 16.j, 0. + 16.j, 0. + 16.j]], + [[0. + 2.j, 0. + 2.j, 0. + 2.j], + [0. + 2.j, 0. + 2.j, 0. + 2.j], + [0. + 2.j, 0. + 2.j, 0. + 2.j]], + [[0. + 8.j, 0. + 8.j, 0. + 8.j], + [0. + 8.j, 0. + 8.j, 0. + 8.j], + [0. + 8.j, 0. + 8.j, 0. + 8.j]], + [[0. + 4.j, 0. + 4.j, 0. + 4.j], + [0. + 4.j, 0. + 4.j, 0. + 4.j], + [0. + 4.j, 0. + 4.j, 0. + 4.j]], + [[0. + 16.j, 0. + 16.j, 0. + 16.j], + [0. + 16.j, 0. + 16.j, 0. + 16.j], + [0. + 16.j, 0. + 16.j, 0. + 16.j]], + [[0. + 2.j, 0. + 2.j, 0. + 2.j], + [0. + 2.j, 0. + 2.j, 0. + 2.j], + [0. + 2.j, 0. + 2.j, 0. + 2.j]], + [[0. + 8.j, 0. + 8.j, 0. + 8.j], + [0. + 8.j, 0. + 8.j, 0. + 8.j], + [0. + 8.j, 0. + 8.j, 0. + 8.j]], + [[0. + 4.j, 0. + 4.j, 0. + 4.j], + [0. + 4.j, 0. + 4.j, 0. + 4.j], + [0. + 4.j, 0. + 4.j, 0. + 4.j]], + [[0. + 16.j, 0. + 16.j, 0. + 16.j], + [0. + 16.j, 0. + 16.j, 0. + 16.j], + [0. + 16.j, 0. + 16.j, 0. + 16.j]], + [[0. + 2.j, 0. + 2.j, 0. + 2.j], + [0. + 2.j, 0. + 2.j, 0. + 2.j], + [0. + 2.j, 0. + 2.j, 0. + 2.j]], + [[0. + 8.j, 0. + 8.j, 0. + 8.j], + [0. + 8.j, 0. + 8.j, 0. + 8.j], + [0. + 8.j, 0. + 8.j, 0. + 8.j]], + [[0. + 4.j, 0. + 4.j, 0. + 4.j], + [0. + 4.j, 0. + 4.j, 0. + 4.j], + [0. + 4.j, 0. + 4.j, 0. + 4.j]], + [[0. + 16.j, 0. + 16.j, 0. + 16.j], + [0. + 16.j, 0. + 16.j, 0. + 16.j], + [0. + 16.j, 0. + 16.j, 0. 
+ 16.j]]], dtype=np.complex64) + np.testing.assert_array_equal(auxiliary_wave.get(), expected_auxiliary_wave, + err_msg="The auxiliary_wave has not been updated as expected") + + def test_build_aux_no_ex_noadd_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave_dev = cp.zeros_like(exit_wave_dev) + auxiliary_wave = np.zeros_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_aux_no_ex(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, + fac=1.0, add=False) + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + nAWK.allocate() + nAWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=1.0, add=False) + + ## Assert + np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave, + err_msg="The auxiliary_wave does not match numpy") + + def test_build_aux2_no_ex_noadd_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave_dev = cp.zeros_like(exit_wave_dev) + auxiliary_wave = np.zeros_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_aux2_no_ex(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, + fac=1.0, add=False) + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + nAWK.allocate() + nAWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=1.0, add=False) + + ## Assert + np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave, + err_msg="The auxiliary_wave does not match numpy") + + + def test_build_aux_no_ex_add_REGRESSION(self): + ## Arrange + addr, 
object_array, probe, exit_wave = self.prepare_arrays() + addr, object_array, probe, exit_wave = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = cp.ones_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + fac = 2.0 + AWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=fac, add=True) + + ## Assert + expected_auxiliary_wave = np.array([[[0. + 2.j, 0. + 2.j, 0. + 2.j], + [0. + 2.j, 0. + 2.j, 0. + 2.j], + [0. + 2.j, 0. + 2.j, 0. + 2.j]], + [[0. + 8.j, 0. + 8.j, 0. + 8.j], + [0. + 8.j, 0. + 8.j, 0. + 8.j], + [0. + 8.j, 0. + 8.j, 0. + 8.j]], + [[0. + 4.j, 0. + 4.j, 0. + 4.j], + [0. + 4.j, 0. + 4.j, 0. + 4.j], + [0. + 4.j, 0. + 4.j, 0. + 4.j]], + [[0. + 16.j, 0. + 16.j, 0. + 16.j], + [0. + 16.j, 0. + 16.j, 0. + 16.j], + [0. + 16.j, 0. + 16.j, 0. + 16.j]], + [[0. + 2.j, 0. + 2.j, 0. + 2.j], + [0. + 2.j, 0. + 2.j, 0. + 2.j], + [0. + 2.j, 0. + 2.j, 0. + 2.j]], + [[0. + 8.j, 0. + 8.j, 0. + 8.j], + [0. + 8.j, 0. + 8.j, 0. + 8.j], + [0. + 8.j, 0. + 8.j, 0. + 8.j]], + [[0. + 4.j, 0. + 4.j, 0. + 4.j], + [0. + 4.j, 0. + 4.j, 0. + 4.j], + [0. + 4.j, 0. + 4.j, 0. + 4.j]], + [[0. + 16.j, 0. + 16.j, 0. + 16.j], + [0. + 16.j, 0. + 16.j, 0. + 16.j], + [0. + 16.j, 0. + 16.j, 0. + 16.j]], + [[0. + 2.j, 0. + 2.j, 0. + 2.j], + [0. + 2.j, 0. + 2.j, 0. + 2.j], + [0. + 2.j, 0. + 2.j, 0. + 2.j]], + [[0. + 8.j, 0. + 8.j, 0. + 8.j], + [0. + 8.j, 0. + 8.j, 0. + 8.j], + [0. + 8.j, 0. + 8.j, 0. + 8.j]], + [[0. + 4.j, 0. + 4.j, 0. + 4.j], + [0. + 4.j, 0. + 4.j, 0. + 4.j], + [0. + 4.j, 0. + 4.j, 0. + 4.j]], + [[0. + 16.j, 0. + 16.j, 0. + 16.j], + [0. + 16.j, 0. + 16.j, 0. + 16.j], + [0. + 16.j, 0. + 16.j, 0. + 16.j]], + [[0. + 2.j, 0. + 2.j, 0. + 2.j], + [0. + 2.j, 0. + 2.j, 0. + 2.j], + [0. + 2.j, 0. + 2.j, 0. + 2.j]], + [[0. + 8.j, 0. + 8.j, 0. + 8.j], + [0. + 8.j, 0. + 8.j, 0. + 8.j], + [0. + 8.j, 0. + 8.j, 0. + 8.j]], + [[0. + 4.j, 0. + 4.j, 0. + 4.j], + [0. + 4.j, 0. + 4.j, 0. + 4.j], + [0. + 4.j, 0. + 4.j, 0. 
+ 4.j]], + [[0. + 16.j, 0. + 16.j, 0. + 16.j], + [0. + 16.j, 0. + 16.j, 0. + 16.j], + [0. + 16.j, 0. + 16.j, 0. + 16.j]]], dtype=np.complex64) + expected_auxiliary_wave = fac*expected_auxiliary_wave + 1 + np.testing.assert_array_equal(auxiliary_wave.get(), expected_auxiliary_wave, + err_msg="The auxiliary_wave has not been updated as expected") + + def test_build_aux_no_ex_add_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave_dev = cp.ones_like(exit_wave_dev) + auxiliary_wave = np.ones_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_aux_no_ex(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, + fac=2.0, add=True) + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + nAWK.allocate() + nAWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=2.0, add=True) + + ## Assert + np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave, + err_msg="The auxiliary_wave does not match numpy") + + def test_build_aux2_no_ex_add_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays() + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave_dev = cp.ones_like(exit_wave_dev) + auxiliary_wave = np.ones_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_aux2_no_ex(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, + fac=2.0, add=True) + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + nAWK.allocate() + nAWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, fac=2.0, add=True) + + ## Assert + 
np.testing.assert_array_equal(auxiliary_wave_dev.get(), auxiliary_wave, + err_msg="The auxiliary_wave does not match numpy") + + + @unittest.skipIf(not perfrun, "performance test") + def test_build_aux_no_ex_performance(self): + addr, object_array, probe, exit_wave = self.prepare_arrays(performance=True) + addr, object_array, probe, exit_wave = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = cp.zeros_like(exit_wave) + + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_aux_no_ex(auxiliary_wave, addr, object_array, probe, + fac=1.0, add=False) + + + def test_build_exit_alpha_tau_REGRESSION(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays(scan_points=1) + addr, object_array, probe, exit_wave = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = cp.zeros_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave) + + # Assert + expected_auxiliary_wave = np.array( + [[[0. -2.j, 0. -2.j, 0. -2.j], + [0. -2.j, 0. -2.j, 0. -2.j], + [0. -2.j, 0. -2.j, 0. -2.j]], + + [[0. -8.j, 0. -8.j, 0. -8.j], + [0. -8.j, 0. -8.j, 0. -8.j], + [0. -8.j, 0. -8.j, 0. -8.j]], + + [[0. -4.j, 0. -4.j, 0. -4.j], + [0. -4.j, 0. -4.j, 0. -4.j], + [0. -4.j, 0. -4.j, 0. -4.j]], + + [[0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j], + [0.-16.j, 0.-16.j, 0.-16.j]]], dtype=np.complex64) + np.testing.assert_allclose(auxiliary_wave.get(), expected_auxiliary_wave, rtol=1e-6, atol=1e-6, + err_msg="The auxiliary_wave has not been updated as expected") + + expected_exit_wave = np.array( + [[[1. -1.j, 1. -1.j, 1. -1.j], + [1. -1.j, 1. -1.j, 1. -1.j], + [1. -1.j, 1. -1.j, 1. -1.j]], + + [[2. -6.j, 2. -6.j, 2. -6.j], + [2. -6.j, 2. -6.j, 2. -6.j], + [2. -6.j, 2. -6.j, 2. -6.j]], + + [[3. -1.j, 3. -1.j, 3. -1.j], + [3. -1.j, 3. -1.j, 3. -1.j], + [3. -1.j, 3. -1.j, 3. 
-1.j]], + + [[4.-12.j, 4.-12.j, 4.-12.j], + [4.-12.j, 4.-12.j, 4.-12.j], + [4.-12.j, 4.-12.j, 4.-12.j]]], dtype=np.complex64) + np.testing.assert_allclose(exit_wave.get(), expected_exit_wave, rtol=1e-6, atol=1e-6, + err_msg="The exit_wave has not been updated as expected") + + def test_build_exit_alpha_tau_UNITY(self): + ## Arrange + addr, object_array, probe, exit_wave = self.prepare_arrays(scan_points=1) + addr_dev, object_array_dev, probe_dev, exit_wave_dev = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave_dev = cp.ones_like(exit_wave_dev) + auxiliary_wave = np.ones_like(exit_wave) + + ## Act + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_exit_alpha_tau(auxiliary_wave_dev, addr_dev, object_array_dev, probe_dev, exit_wave_dev, alpha=0.8, tau=0.6) + from ptypy.accelerate.base.kernels import AuxiliaryWaveKernel as npAuxiliaryWaveKernel + nAWK = npAuxiliaryWaveKernel() + nAWK.allocate() + nAWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=0.8, tau=0.6) + + ## Assert + np.testing.assert_allclose(auxiliary_wave_dev.get(), auxiliary_wave, rtol=1e-6, atol=1e-6, + err_msg="The auxiliary_wave does not match numpy") + ## Assert + np.testing.assert_allclose(exit_wave_dev.get(), exit_wave, rtol=1e-6, atol=1e-6, + err_msg="The exit_wave does not match numpy") + + @unittest.skipIf(not perfrun, "performance test") + def test_build_exit_alpha_tau_performance(self): + addr, object_array, probe, exit_wave = self.prepare_arrays(performance=True, scan_points=1) + addr, object_array, probe, exit_wave = self.copy_to_gpu(addr, object_array, probe, exit_wave) + auxiliary_wave = cp.zeros_like(exit_wave) + + AWK = AuxiliaryWaveKernel(self.stream) + AWK.allocate() + AWK.build_exit_alpha_tau(auxiliary_wave, addr, object_array, probe, exit_wave, alpha=0.8, tau=0.6) + +if __name__ == '__main__': + unittest.main() diff --git a/test/accelerate_tests/cuda_cupy_tests/derivatives_kernel_test.py 
b/test/accelerate_tests/cuda_cupy_tests/derivatives_kernel_test.py new file mode 100644 index 000000000..d2235539b --- /dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/derivatives_kernel_test.py @@ -0,0 +1,330 @@ +import unittest +import numpy as np +from . import perfrun, CupyCudaTest, have_cupy +from ptypy.accelerate.base import array_utils as au +from ptypy.utils.math_utils import delxf, delxb + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +if have_cupy(): + import cupy as cp + from ptypy.accelerate.cuda_cupy.array_utils import DerivativesKernel + +class DerivativesKernelTest(CupyCudaTest): + + def test_delxf_1dim(self): + inp = np.array([0, 1, 2, 4, 8, 0, 6], dtype=np.float32) + inp_dev = cp.asarray(inp) + outp = np.zeros_like(inp) + outp_dev = cp.asarray(outp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxf(inp_dev, out=outp_dev) + + outp[:] = outp_dev.get() + + exp = np.array([1, 1, 2, 4, -8, 6, 0], dtype=np.float32) + np.testing.assert_array_equal(outp, exp) + + def test_delxf_1dim_inplace(self): + inp = np.array([0, 1, 2, 4, 8, 0, 6], dtype=np.float32) + inp_dev = cp.asarray(inp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxf(inp_dev, out=inp_dev) + + outp = inp_dev.get() + + exp = np.array([1, 1, 2, 4, -8, 6, 0], dtype=np.float32) + np.testing.assert_array_equal(outp, exp) + + def test_delxf_2dim1(self): + inp = np.array([ + [0, 2, 6], + [1, -4, 5] + ], dtype=np.float32) + + inp_dev = cp.asarray(inp) + outp = np.zeros_like(inp) + outp_dev = cp.asarray(outp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxf(inp_dev, out=outp_dev, axis=0) + + outp[:] = outp_dev.get() + + + exp = np.array([ + [1, -6, -1], + [0, 0, 0] + ], dtype=np.float32) + np.testing.assert_array_equal(outp, exp) + + def test_delxf_2dim2(self): + inp = np.array([ + [0, 2, 6], + [1, -4, 5] + ], dtype=np.float32) + inp_dev = cp.asarray(inp) + outp = np.zeros_like(inp) + outp_dev = cp.asarray(outp) 
+ + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxf(inp_dev, out=outp_dev, axis=1) + + outp[:] = outp_dev.get() + + exp = np.array([ + [2, 4, 0], + [-5, 9, 0] + ], dtype=np.float32) + np.testing.assert_array_equal(outp, exp) + + def test_delxb_1dim(self): + inp = np.array([0, 1, 2, 4, 8, 0, 6], dtype=np.float32) + inp_dev = cp.asarray(inp) + outp = np.zeros_like(inp) + outp_dev = cp.asarray(outp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxb(inp_dev, out=outp_dev) + + outp[:] = outp_dev.get() + + exp = np.array([0, 1, 1, 2, 4, -8, 6], dtype=np.float32) + np.testing.assert_array_equal(outp, exp) + + def test_delxb_2dim1(self): + inp = np.array([ + [0, 2, 6], + [1, -4, 5] + ], dtype=np.float32) + inp_dev = cp.asarray(inp) + outp = np.zeros_like(inp) + outp_dev = cp.asarray(outp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxb(inp_dev, out=outp_dev, axis=0) + + outp[:] = outp_dev.get() + + + exp = np.array([ + [0, 0, 0], + [1, -6, -1], + ], dtype=np.float32) + np.testing.assert_array_equal(outp, exp) + + def test_delxb_2dim2(self): + inp = np.array([ + [0, 2, 6], + [1, -4, 5] + ], dtype=np.float32) + inp_dev = cp.asarray(inp) + outp = np.zeros_like(inp) + outp_dev = cp.asarray(outp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxb(inp_dev, out=outp_dev, axis=1) + + outp[:] = outp_dev.get() + + + exp = np.array([ + [0, 2, 4], + [0, -5, 9] + ], dtype=np.float32) + np.testing.assert_array_equal(outp, exp) + + def test_delxf_2dim2complex(self): + inp = np.array([ + [0, 2, 6], + [1, -4, 5] + ],dtype=np.float32) + 1j * np.array([ + [0, 4, 12], + [2, -8, 10] + ],dtype=np.float32) + inp_dev = cp.asarray(inp) + outp = np.zeros_like(inp) + outp_dev = cp.asarray(outp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxf(inp_dev, out=outp_dev, axis=1) + + outp[:] = outp_dev.get() + + exp = np.array([ + [2, 4, 0], + [-5, 9, 0] + ], dtype=np.float32) + 1j * np.array([ + [4, 8, 0], + [-10, 
18, 0] + ], dtype=np.float32) + np.testing.assert_array_equal(outp, exp) + + def test_delxf_3dim2(self): + inp = np.array([ + [ + [1, 2, 4,], + [7, 11, 16,], + ], + [ + [22, 29, 37,], + [46, 56, 67] + ] + ], dtype=np.float32) + + inp_dev = cp.asarray(inp) + outp = np.zeros_like(inp) + outp_dev = cp.asarray(outp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxf(inp_dev, out=outp_dev, axis=1) + + outp[:] = outp_dev.get() + + exp = np.array([ + [ + [6, 9, 12,], + [0, 0, 0,], + ], + [ + [24, 27, 30,], + [0, 0, 0], + ] + ], dtype=np.float32) + + np.testing.assert_array_equal(outp, exp) + + def test_delxf_3dim1_unity(self): + inp = np.ascontiguousarray(np.random.randn(33, 283, 142), dtype=np.float32) + + inp_dev = cp.asarray(inp) + outp = np.zeros_like(inp) + outp_dev = cp.asarray(outp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxf(inp_dev, out=outp_dev, axis=0) + outp[:] = outp_dev.get() + + exp = delxf(inp, axis=0) + np.testing.assert_array_almost_equal(outp, exp) + + def test_delxf_3dim2_unity1(self): + inp = np.array([ + [ [1], [2], [4]], + [ [8], [16], [32]] + ], dtype=np.float32) + + inp_dev = cp.asarray(inp) + outp = np.zeros_like(inp) + outp_dev = cp.asarray(outp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxf(inp_dev, out=outp_dev, axis=1) + outp[:] = outp_dev.get() + + exp = delxf(inp, axis=1) + + np.testing.assert_array_almost_equal(np.squeeze(outp), np.squeeze(exp)) + + def test_delxf_3dim2_unity2(self): + inp = np.array([ + [ [1, 2], [4, 7], [11,16] ], + [ [22,29], [37,46], [56,67]] + ], dtype=np.float32) + + inp_dev = cp.asarray(inp) + outp = np.zeros_like(inp) + outp_dev = cp.asarray(outp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxf(inp_dev, out=outp_dev, axis=1) + outp[:] = outp_dev.get() + + exp = delxf(inp, axis=1) + + np.testing.assert_array_almost_equal(np.squeeze(outp), np.squeeze(exp)) + + def test_delxf_3dim2_unity(self): + inp = 
np.ascontiguousarray(np.random.randn(33, 283, 142), dtype=np.float32) + + inp_dev = cp.asarray(inp) + outp = np.zeros_like(inp) + outp_dev = cp.asarray(outp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxf(inp_dev, out=outp_dev, axis=1) + outp[:] = outp_dev.get() + + exp = delxf(inp, axis=1) + np.testing.assert_array_almost_equal(outp, exp) + + def test_delxf_3dim3_unity(self): + inp = np.ascontiguousarray(np.random.randn(33, 283, 142), dtype=np.float32) + + inp_dev = cp.asarray(inp) + outp = np.zeros_like(inp) + outp_dev = cp.asarray(outp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxf(inp_dev, out=outp_dev, axis=2) + outp[:] = outp_dev.get() + + exp = delxf(inp, axis=2) + np.testing.assert_array_almost_equal(outp, exp) + + def test_delxb_3dim3_unity(self): + inp = np.ascontiguousarray(np.random.randn(33, 283, 142), dtype=np.float32) + + inp_dev = cp.asarray(inp) + outp = np.zeros_like(inp) + outp_dev = cp.asarray(outp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxb(inp_dev, out=outp_dev, axis=2) + outp[:] = outp_dev.get() + + exp = delxb(inp, axis=2) + np.testing.assert_array_almost_equal(outp, exp) + + @unittest.skipIf(not perfrun, "performance test") + def test_perf_3d_0(self): + shape = [500, 1024, 1024] + inp = np.ones(shape, dtype=np.complex64) + inp_dev = cp.asarray(inp) + outp = np.ones_like(inp) + outp_dev = cp.asarray(outp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxf(inp_dev, out=outp_dev, axis=0) + outp[:] = outp_dev.get() + np.testing.assert_array_equal(outp, 0) + + @unittest.skipIf(not perfrun, "performance test") + def test_perf_3d_1(self): + shape = [500, 1024, 1024] + inp = np.ones(shape, dtype=np.complex64) + inp_dev = cp.asarray(inp) + outp = np.ones_like(inp) + outp_dev = cp.asarray(outp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxf(inp_dev, out=outp_dev, axis=1) + outp[:] = outp_dev.get() + np.testing.assert_array_equal(outp, 0) + + 
@unittest.skipIf(not perfrun, "performance test") + def test_perf_3d_2(self): + shape = [500, 1024, 1024] + inp = np.ones(shape, dtype=np.complex64) + inp_dev = cp.asarray(inp) + outp = np.ones_like(inp) + outp_dev = cp.asarray(outp) + + DK = DerivativesKernel(inp.dtype, queue=self.stream) + DK.delxf(inp_dev, out=outp_dev, axis=2) + outp[:] = outp_dev.get() + np.testing.assert_array_equal(outp, 0) \ No newline at end of file diff --git a/test/accelerate_tests/cuda_cupy_tests/engine_tests.py b/test/accelerate_tests/cuda_cupy_tests/engine_tests.py new file mode 100644 index 000000000..fe70b58bc --- /dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/engine_tests.py @@ -0,0 +1,172 @@ +""" +Test for the ML engine. + +This file is part of the PTYPY package. + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: see LICENSE for details. +""" +import unittest + +from test import utils as tu +from ptypy import utils as u +import ptypy +ptypy.load_gpu_engines("cupy") +import tempfile +import shutil +import numpy as np + +class MLCupyTest(unittest.TestCase): + + def setUp(self): + self.outpath = tempfile.mkdtemp(suffix="ML_cupy_test") + + def tearDown(self): + shutil.rmtree(self.outpath) + + def check_engine_output(self, output, plotting=False, debug=False, scan="MF"): + key = "S%sG00" %scan + P_ML_serial, P_ML_cupy = output + numiter = len(P_ML_serial.runtime["iter_info"]) + LL_ML_serial = np.array([P_ML_serial.runtime["iter_info"][i]["error"][1] for i in range(numiter)]) + LL_ML_cupy = np.array([P_ML_cupy.runtime["iter_info"][i]["error"][1] for i in range(numiter)]) + crop = 42 + OBJ_ML_serial, OBJ_ML_cupy = P_ML_serial.obj.S[key].data[0,crop:-crop,crop:-crop], P_ML_cupy.obj.S[key].data[0,crop:-crop,crop:-crop] + PRB_ML_serial, PRB_ML_cupy = P_ML_serial.probe.S[key].data[0], P_ML_cupy.probe.S[key].data[0] + MED_ML_serial = np.median(np.angle(OBJ_ML_serial)) + MED_ML_cupy = np.median(np.angle(OBJ_ML_cupy)) + eng_ML_serial = 
P_ML_serial.engines["engine00"] + eng_ML_cupy = P_ML_cupy.engines["engine00"] + if debug: + import matplotlib.pyplot as plt + plt.figure("ML serial debug") + plt.imshow(np.abs(eng_ML_serial.debug)) + plt.figure("ML cupy debug") + plt.imshow(np.abs(eng_ML_cupy.debug)) + plt.show() + + if plotting: + import matplotlib.pyplot as plt + plt.figure("Errors") + plt.plot(LL_ML_serial, label="ML_serial") + plt.plot(LL_ML_cupy, label="ML_cupy") + plt.legend() + plt.show() + plt.figure("Phase ML serial") + plt.imshow(np.angle(OBJ_ML_serial*np.exp(-1j*MED_ML_serial))) + plt.figure("Ampltitude ML serial") + plt.imshow(np.abs(OBJ_ML_serial)) + plt.figure("Phase ML cupy") + plt.imshow(np.angle(OBJ_ML_cupy*np.exp(-1j*MED_ML_cupy))) + plt.figure("Amplitude ML cupy") + plt.imshow(np.abs(OBJ_ML_cupy)) + plt.figure("Phase difference") + plt.imshow(np.angle(OBJ_ML_cupy) - np.angle(OBJ_ML_serial), vmin=-0.1, vmax=0.1) + plt.colorbar() + plt.figure("Amplitude difference") + plt.imshow(np.abs(OBJ_ML_cupy) - np.abs(OBJ_ML_serial), vmin=-0.1, vmax=0.1) + plt.colorbar() + plt.show() + # np.testing.assert_allclose(eng_ML_serial.debug, eng_ML_cupy.debug, atol=1e-7, rtol=1e-7, + # err_msg="The debug arrays are not matching as expected") + RMSE_ob = (np.mean(np.abs(OBJ_ML_cupy - OBJ_ML_serial)**2)) + RMSE_pr = (np.mean(np.abs(PRB_ML_cupy - PRB_ML_serial)**2)) + # RMSE_LL = (np.mean(np.abs(LL_ML_serial - LL_ML)**2)) + np.testing.assert_allclose(RMSE_ob, 0.0, atol=1e-2, + err_msg="The object arrays are not matching as expected") + np.testing.assert_allclose(RMSE_pr, 0.0, atol=1e-2, + err_msg="The object arrays are not matching as expected") + # np.testing.assert_allclose(RMSE_LL, 0.0, atol=1e-7, + # err_msg="The log-likelihood errors are not matching as expected") + + def test_ML_cupy_base(self): + out = [] + for eng in ["ML_serial", "ML_cupy"]: + engine_params = u.Param() + engine_params.name = eng + engine_params.numiter = 100 + engine_params.floating_intensities = False + engine_params.reg_del2 
= False + engine_params.reg_del2_amplitude = 1. + engine_params.scale_precond = False + out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True, + scanmodel="BlockFull", autosave=False, verbose_level="critical")) + self.check_engine_output(out, plotting=False, debug=False) + + def test_ML_cupy_regularizer(self): + out = [] + for eng in ["ML_serial", "ML_cupy"]: + engine_params = u.Param() + engine_params.name = eng + engine_params.numiter = 100 + engine_params.floating_intensities = False + engine_params.reg_del2 = True + engine_params.reg_del2_amplitude = 1. + engine_params.scale_precond = False + out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True, + scanmodel="BlockFull", autosave=False, verbose_level="critical")) + self.check_engine_output(out, plotting=False, debug=False) + + def test_ML_cupy_preconditioner(self): + out = [] + for eng in ["ML_serial", "ML_cupy"]: + engine_params = u.Param() + engine_params.name = eng + engine_params.numiter = 100 + engine_params.floating_intensities = False + engine_params.reg_del2 = False + engine_params.reg_del2_amplitude = 1. + engine_params.scale_precond = True + engine_params.scale_probe_object = 1e-6 + out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True, + scanmodel="BlockFull", autosave=False, verbose_level="critical")) + self.check_engine_output(out, plotting=False, debug=False) + + def test_ML_cupy_floating(self): + out = [] + for eng in ["ML_serial", "ML_cupy"]: + engine_params = u.Param() + engine_params.name = eng + engine_params.numiter = 100 + engine_params.floating_intensities = True + engine_params.reg_del2 = False + engine_params.reg_del2_amplitude = 1. 
+ engine_params.scale_precond = False + out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True, + scanmodel="BlockFull", autosave=False, verbose_level="critical")) + self.check_engine_output(out, plotting=False, debug=False) + + def test_ML_cupy_smoothing_regularizer(self): + out = [] + for eng in ["ML_serial", "ML_cupy"]: + engine_params = u.Param() + engine_params.name = eng + engine_params.numiter = 200 + engine_params.floating_intensities = False + engine_params.reg_del2 = False + engine_params.reg_del2_amplitude = 1. + engine_params.smooth_gradient = 20 + engine_params.smooth_gradient_decay = 1/10. + engine_params.scale_precond = False + out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True, + scanmodel="BlockFull", autosave=False, verbose_level="critical")) + self.check_engine_output(out, plotting=False, debug=False) + + def test_ML_cupy_all(self): + out = [] + for eng in ["ML_serial", "ML_cupy"]: + engine_params = u.Param() + engine_params.name = eng + engine_params.numiter = 100 + engine_params.floating_intensities = False + engine_params.reg_del2 = True + engine_params.reg_del2_amplitude = 1. + engine_params.smooth_gradient = 20 + engine_params.smooth_gradient_decay = 1/10. + engine_params.scale_precond = True + engine_params.scale_probe_object = 1e-6 + out.append(tu.EngineTestRunner(engine_params, output_path=self.outpath, init_correct_probe=True, + scanmodel="BlockFull", autosave=False, verbose_level="info")) + self.check_engine_output(out, plotting=False, debug=False) + +if __name__ == "__main__": + unittest.main() diff --git a/test/accelerate_tests/cuda_cupy_tests/engine_utils_test.py b/test/accelerate_tests/cuda_cupy_tests/engine_utils_test.py new file mode 100644 index 000000000..017f85ba4 --- /dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/engine_utils_test.py @@ -0,0 +1,52 @@ +''' + + +''' + +import unittest +import numpy as np +from . 
import perfrun, CupyCudaTest, have_cupy + +if have_cupy(): + import cupy as cp + from ptypy.accelerate.cuda_cupy.engines.ML_cupy import Regul_del2_cupy + from ptypy.engines.ML import Regul_del2 + + +class EngineUtilsTest(CupyCudaTest): + + def test_regul_del2_grad_unity(self): + ## Arrange + A = (np.random.randn(40,40) + +1j*np.random.randn(40,40)).astype(np.complex64) + A_dev = cp.asarray(A) + + ## Act + Reg = Regul_del2(0.1) + Reg_dev = Regul_del2_cupy(0.1) + grad_dev = Reg_dev.grad(A_dev).get() + grad = Reg.grad(A) + #grad_dev = grad + ## Assert + np.testing.assert_allclose(grad_dev, grad, rtol=1e-7) + np.testing.assert_allclose(Reg_dev.LL, Reg.LL, rtol=1e-7) + + + def test_regul_del2_coeff_unity(self): + ## Arrange + A = (np.random.randn(40,40) + +1j*np.random.randn(40,40)).astype(np.complex64) + B = (np.random.randn(40,40) + +1j*np.random.randn(40,40)).astype(np.complex64) + A_dev = cp.asarray(A) + B_dev = cp.asarray(B) + + ## Act + Reg = Regul_del2(0.1) + Reg_dev = Regul_del2_cupy(0.1) + d = Reg_dev.poly_line_coeffs(A_dev, B_dev) + c = Reg.poly_line_coeffs(A, B) + #grad_dev = grad + #d = c + ## Assert + np.testing.assert_allclose(c, d, rtol=1e-6) diff --git a/test/accelerate_tests/cuda_cupy_tests/fft_scaling_test.py b/test/accelerate_tests/cuda_cupy_tests/fft_scaling_test.py new file mode 100644 index 000000000..00d785859 --- /dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/fft_scaling_test.py @@ -0,0 +1,204 @@ +''' + + +''' + +import unittest +import numpy as np +from . 
import CupyCudaTest, have_cupy + +if have_cupy(): + import cupy as cp + from ptypy.accelerate.cuda_cupy.cufft import FFT_cuda, FFT_cupy + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +def get_forward_cuFFT(f, stream, + pre_fft, post_fft, inplace, + symmetric, external=True): + if external: + return FFT_cuda(f, stream, pre_fft=pre_fft, post_fft=post_fft, inplace=inplace, + symmetric=symmetric, forward=True).ft + else: + return FFT_cupy(f, stream, pre_fft=pre_fft, post_fft=post_fft, inplace=inplace, + symmetric=symmetric, forward=True).ft + +def get_reverse_cuFFT(f, stream, + pre_fft, post_fft, inplace, + symmetric, external=True): + if external: + return FFT_cuda(f, stream, pre_fft=pre_fft, post_fft=post_fft, inplace=inplace, + symmetric=symmetric, forward=False).ift + else: + return FFT_cupy(f, stream, pre_fft=pre_fft, post_fft=post_fft, inplace=inplace, + symmetric=symmetric, forward=False).ift + + + +class FftScalingTest(CupyCudaTest): + + def get_input(self): + rows = cols = 32 + batches = 1 + f = np.ones(shape=(batches, rows, cols), dtype=COMPLEX_TYPE) + return f + + #### Trivial foward transform tests #### + + def fwd_test(self, symmetric, factory, preffact=None, postfact=None, external=True): + f = self.get_input() + f_d = cp.asarray(f) + if preffact is not None: + pref = preffact * np.ones(shape=f.shape[-2:], dtype=np.complex64) + pref_d = cp.asarray(pref) + else: + preffact=1.0 + pref_d = None + if postfact is not None: + post = postfact * np.ones(shape=f.shape[-2:], dtype=np.complex64) + post_d = cp.asarray(post) + else: + postfact=1.0 + post_d = None + ft = factory(f, self.stream, + pre_fft=pref_d, post_fft=post_d, inplace=True, + symmetric=symmetric, external=external) + ft(f_d, f_d) + f_back = f_d.get() + elements = f.shape[-2] * f.shape[-1] + scale = 1.0 if not symmetric else 1.0 / np.sqrt(elements) + expected = elements * scale * preffact * postfact + self.assertAlmostEqual(f_back[0,0,0], expected) + 
np.testing.assert_array_almost_equal(f_back.flat[1:], 0) + + def test_fwd_noscale_cufft(self): + self.fwd_test(False, get_forward_cuFFT) + + def test_fwd_noscale_cufft_cupy(self): + self.fwd_test(False, get_forward_cuFFT, external=False) + + def test_fwd_scale_cufft(self): + self.fwd_test(True, get_forward_cuFFT) + + def test_fwd_scale_cufft_cupy(self): + self.fwd_test(True, get_forward_cuFFT, external=False) + + def test_prefilt_fwd_noscale_cufft(self): + self.fwd_test(False, get_forward_cuFFT, preffact=2.0) + + def test_prefilt_fwd_noscale_cufft_cupy(self): + self.fwd_test(False, get_forward_cuFFT, preffact=2.0, external=False) + + def test_prefilt_fwd_scale_cufft(self): + self.fwd_test(True, get_forward_cuFFT, preffact=2.0) + + def test_prefilt_fwd_scale_cufft_cupy(self): + self.fwd_test(True, get_forward_cuFFT, preffact=2.0, external=False) + + def test_postfilt_fwd_noscale_cufft(self): + self.fwd_test(False, get_forward_cuFFT, postfact=2.0) + + def test_postfilt_fwd_noscale_cufft_cupy(self): + self.fwd_test(False, get_forward_cuFFT, postfact=2.0, external=False) + + def test_postfilt_fwd_scale_cufft(self): + self.fwd_test(True, get_forward_cuFFT, postfact=2.0) + + def test_postfilt_fwd_scale_cufft_cupy(self): + self.fwd_test(True, get_forward_cuFFT, postfact=2.0, external=False) + + def test_prepostfilt_fwd_noscale_cufft(self): + self.fwd_test(False, get_forward_cuFFT, postfact=2.0, preffact=1.5) + + def test_prepostfilt_fwd_noscale_cufft_cupy(self): + self.fwd_test(False, get_forward_cuFFT, postfact=2.0, preffact=1.5, external=False) + + def test_prepostfilt_fwd_scale_cufft(self): + self.fwd_test(True, get_forward_cuFFT, postfact=2.0, preffact=1.5) + + def test_prepostfilt_fwd_scale_cufft_cupy(self): + self.fwd_test(True, get_forward_cuFFT, postfact=2.0, preffact=1.5, external=False) + + + ############# Trivial inverse transform tests ######### + + def rev_test(self, symmetric, factory, preffact=None, postfact=None, external=True): + f = self.get_input() + 
f_d = cp.asarray(f) + if preffact is not None: + pref = preffact * np.ones(shape=f.shape[-2:], dtype=np.complex64) + pref_d = cp.asarray(pref) + else: + preffact=1.0 + pref_d = None + if postfact is not None: + post = postfact * np.ones(shape=f.shape[-2:], dtype=np.complex64) + post_d = cp.asarray(post) + else: + postfact=1.0 + post_d = None + ift = factory(f, self.stream, + pre_fft=pref_d, post_fft=post_d, inplace=True, symmetric=symmetric, + external=external) + ift(f_d, f_d) + f_back = f_d.get() + elements = f.shape[-2] * f.shape[-1] + scale = 1.0 if not symmetric else np.sqrt(elements) + expected = scale * preffact * postfact + self.assertAlmostEqual(f_back[0,0,0], expected) + np.testing.assert_array_almost_equal(f_back.flat[1:], 0) + + + def test_rev_noscale_cufft(self): + self.rev_test(False, get_reverse_cuFFT) + + def test_rev_noscale_cufft_cupy(self): + self.rev_test(False, get_reverse_cuFFT, external=False) + + def test_rev_scale_cufft(self): + self.rev_test(True, get_reverse_cuFFT) + + def test_rev_scale_cufft_cupy(self): + self.rev_test(True, get_reverse_cuFFT, external=False) + + def test_prefilt_rev_noscale_cufft(self): + self.rev_test(False, get_reverse_cuFFT, preffact=1.5) + + def test_prefilt_rev_noscale_cufft_cupy(self): + self.rev_test(False, get_reverse_cuFFT, preffact=1.5, external=False) + + def test_prefilt_rev_scale_cufft(self): + self.rev_test(True, get_reverse_cuFFT, preffact=1.5) + + def test_prefilt_rev_scale_cufft_cupy(self): + self.rev_test(True, get_reverse_cuFFT, preffact=1.5, external=False) + + def test_postfilt_rev_noscale_cufft(self): + self.rev_test(False, get_reverse_cuFFT, postfact=1.5) + + def test_postfilt_rev_noscale_cufft_cupy(self): + self.rev_test(False, get_reverse_cuFFT, postfact=1.5, external=False) + + def test_postfilt_rev_scale_cufft(self): + self.rev_test(True, get_reverse_cuFFT, postfact=1.5) + + def test_postfilt_rev_scale_cufft_cupy(self): + self.rev_test(True, get_reverse_cuFFT, postfact=1.5, external=False) + 
+ def test_prepostfilt_rev_noscale_cufft(self): + self.rev_test(False, get_reverse_cuFFT, postfact=1.5, preffact=2.0) + + def test_prepostfilt_rev_noscale_cufft_cupy(self): + self.rev_test(False, get_reverse_cuFFT, postfact=1.5, preffact=2.0, external=False) + + def test_prepostfilt_rev_scale_cufft(self): + self.rev_test(True, get_reverse_cuFFT, postfact=1.5, preffact=2.0) + + def test_prepostfilt_rev_scale_cufft_cupy(self): + self.rev_test(True, get_reverse_cuFFT, postfact=1.5, preffact=2.0, external=False) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/accelerate_tests/cuda_cupy_tests/fft_setstream_test.py b/test/accelerate_tests/cuda_cupy_tests/fft_setstream_test.py new file mode 100644 index 000000000..ab6c92830 --- /dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/fft_setstream_test.py @@ -0,0 +1,97 @@ +import unittest +import numpy as np +from . import CupyCudaTest, have_cupy +import time + +if have_cupy(): + import cupy as cp + from ptypy.accelerate.cuda_cupy.cufft import FFT_cuda as cuFFT + from ptypy.accelerate.cuda_cupy.cufft import FFT_cupy as cupyCuFFT + + COMPLEX_TYPE = np.complex64 + FLOAT_TYPE = np.float32 + INT_TYPE = np.int32 + +class FftSetStreamTest(CupyCudaTest): + + def helper(self, FFT): + f = np.ones(shape=(200, 128, 128), dtype=COMPLEX_TYPE) + t1 = time.time() + FW = FFT(f, self.stream, pre_fft=None, post_fft=None, inplace=True, + symmetric=True) + t2 = time.time() + dur1 = t2 - t1 + with self.stream: + f_dev = cp.asarray(f) + self.stream.synchronize() + + # measure with events to make sure that something actually + # happened in the right stream + with self.stream: + ev1 = cp.cuda.Event() + ev2 = cp.cuda.Event() + rt1 = time.time() + ev1.record() + FW.ft(f_dev, f_dev) + with self.stream: + ev2.record() + ev1.synchronize() + ev2.synchronize() + self.stream.synchronize() + gput = cp.cuda.get_elapsed_time(ev1, ev2)*1e-3 + rt2 = time.time() + cput = rt2-rt1 + rel = 1-gput/cput + + print('Origial: CPU={}, GPU={}, 
reldiff={}'.format(cput, gput, rel)) + + self.assertEqual(self.stream, FW.queue) + self.assertLess(rel, 0.3) # max 30% diff + + stream2 = cp.cuda.Stream() + + measure = False # measure time to set the stream + if measure: + avg = 100 + else: + avg = 1 + t1 = time.time() + for i in range(avg): + FW.queue = stream2 + stream2.synchronize() + t2 = time.time() + dur2 = (t2 - t1)/avg + + with stream2: + ev1 = cp.cuda.Event() + ev2 = cp.cuda.Event() + rt1 = time.time() + ev1.record() + FW.ft(f_dev, f_dev) + with stream2: + ev2.record() + ev1.synchronize() + ev2.synchronize() + stream2.synchronize() + gput = cp.cuda.get_elapsed_time(ev1, ev2)*1e-3 + self.stream.synchronize() + rt2 = time.time() + cput = rt2-rt1 + rel = 1 - gput/cput + + print('New: CPU={}, GPU={}, reldiff={}'.format(cput, gput, rel)) + + self.assertEqual(stream2, FW.queue) + self.assertLess(rel, 0.3) # max 30% diff + + if measure: + print('initial: {}, set_stream: {}'.format(dur1, dur2)) + assert False + + + + def test_set_stream_b_cufft(self): + self.helper(cuFFT) + + def test_set_stream_c_cupy_cufft(self): + self.helper(cupyCuFFT) diff --git a/test/accelerate_tests/cuda_cupy_tests/fourier_update_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/fourier_update_kernel_test.py new file mode 100644 index 000000000..18fa82aa8 --- /dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/fourier_update_kernel_test.py @@ -0,0 +1,685 @@ +''' + + +''' + +import unittest +import numpy as np +from . 
import CupyCudaTest, have_cupy + + +if have_cupy(): + import cupy as cp + from ptypy.accelerate.cuda_cupy.kernels import FourierUpdateKernel + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class FourierUpdateKernelTest(CupyCudaTest): + + + def test_fmag_all_update_UNITY(self): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + G = 2 # number og object modes + + E = B # probe size y + F = C # probe size x + + scan_pts = 2 # one dimensional scan point number + + N = scan_pts ** 2 + total_number_modes = G * D + A = N * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE) # the measured magnitudes NxAxB + fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype) + fmag[:] = fmag_fill + + mask = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE)# the masks for the measured magnitudes either 1xAxB or NxAxB + mask_fill = np.ones_like(mask) + mask_fill[::2, ::2] = 0 # checkerboard for testing + mask[:] = mask_fill + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((N,)) + Y = Y.reshape((N,)) + + addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [position_idx, 0, 0], + [position_idx, 0, 0]]) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + # print("address book is:") + # print(repr(addr)) + + ''' + test + ''' + mask_sum = mask.sum(-1).sum(-1) + + err_fmag = np.zeros(N, dtype=FLOAT_TYPE) + from ptypy.accelerate.base.kernels import FourierUpdateKernel as 
npFourierUpdateKernel + pbound_set = 0.9 + nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes) + FUK = FourierUpdateKernel(f, nmodes=total_number_modes) + + nFUK.allocate() + FUK.allocate() + + nFUK.fourier_error(f, addr, fmag, mask, mask_sum) + nFUK.error_reduce(addr, err_fmag) + # print(np.sqrt(pbound_set/err_fmag)) + f_d = cp.asarray(f) + fmag_d = cp.asarray(fmag) + mask_d = cp.asarray(mask) + err_fmag_d = cp.asarray(err_fmag) + addr_d = cp.asarray(addr) + + # now set the state for both. + + FUK.gpu.fdev = cp.asarray(nFUK.npy.fdev) + FUK.gpu.ferr = cp.asarray(nFUK.npy.ferr) + + FUK.fmag_all_update(f_d, addr_d, fmag_d, mask_d, err_fmag_d, pbound=pbound_set) + + + nFUK.fmag_all_update(f, addr, fmag, mask, err_fmag, pbound=pbound_set) + expected_f = f + measured_f = f_d.get() + np.testing.assert_allclose(expected_f, measured_f, rtol=1e-6, err_msg="Numpy f " + "is \n%s, \nbut gpu f is \n %s, \n mask is:\n %s \n" % (repr(expected_f), + repr(measured_f), + repr(mask))) + + def test_fmag_update_nopbound_UNITY(self): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + G = 2 # number og object modes + + E = B # probe size y + F = C # probe size x + + scan_pts = 2 # one dimensional scan point number + + N = scan_pts ** 2 + total_number_modes = G * D + A = N * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE) # the measured magnitudes NxAxB + fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype) + fmag[:] = fmag_fill + + mask = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE)# the masks for the measured magnitudes either 1xAxB or NxAxB + mask_fill = np.ones_like(mask) + mask_fill[::2, ::2] = 0 # checkerboard for testing + mask[:] = mask_fill + + X, Y = 
np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((N,)) + Y = Y.reshape((N,)) + + addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [position_idx, 0, 0], + [position_idx, 0, 0]]) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + # print("address book is:") + # print(repr(addr)) + + ''' + test + ''' + mask_sum = mask.sum(-1).sum(-1) + + err_fmag = np.zeros(N, dtype=FLOAT_TYPE) + from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel + nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes) + FUK = FourierUpdateKernel(f, nmodes=total_number_modes) + + nFUK.allocate() + FUK.allocate() + + nFUK.fourier_error(f, addr, fmag, mask, mask_sum) + nFUK.error_reduce(addr, err_fmag) + # print(np.sqrt(pbound_set/err_fmag)) + f_d = cp.asarray(f) + fmag_d = cp.asarray(fmag) + mask_d = cp.asarray(mask) + addr_d = cp.asarray(addr) + + # now set the state for both. 
+ + FUK.gpu.fdev = cp.asarray(nFUK.npy.fdev) + FUK.gpu.ferr = cp.asarray(nFUK.npy.ferr) + + FUK.fmag_update_nopbound(f_d, addr_d, fmag_d, mask_d) + nFUK.fmag_update_nopbound(f, addr, fmag, mask) + + expected_f = f + measured_f = f_d.get() + np.testing.assert_allclose(measured_f, expected_f, rtol=1e-6, err_msg="Numpy f " + "is \n%s, \nbut gpu f is \n %s, \n mask is:\n %s \n" % (repr(expected_f), + repr(measured_f), + repr(mask))) + + + def test_fourier_error_UNITY(self): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + G = 2 # number of object modes + + E = B # probe size y + F = C # probe size x + + scan_pts = 2 # one dimensional scan point number + + N = scan_pts ** 2 + total_number_modes = G * D + A = N * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE) # the measured magnitudes NxAxB + fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype) + fmag[:] = fmag_fill + + mask = np.empty(shape=(N, B, C), + dtype=FLOAT_TYPE) # the masks for the measured magnitudes either 1xAxB or NxAxB + mask_fill = np.ones_like(mask) + mask_fill[::2, ::2] = 0 # checkerboard for testing + mask[:] = mask_fill + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((N,)) + Y = Y.reshape((N,)) + + addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [position_idx, 0, 0], + [position_idx, 0, 0]]) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + mask_sum = mask.sum(-1).sum(-1) + + from 
ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel + f_d = cp.asarray(f) + fmag_d = cp.asarray(fmag) + mask_d = cp.asarray(mask) + addr_d = cp.asarray(addr) + mask_sum_d = cp.asarray(mask_sum) + + nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes) + FUK = FourierUpdateKernel(f, nmodes=total_number_modes) + + nFUK.allocate() + FUK.allocate() + + nFUK.fourier_error(f, addr, fmag, mask, mask_sum) + FUK.fourier_error(f_d, addr_d, fmag_d, mask_d, mask_sum_d) + + expected_fdev = nFUK.npy.fdev + measured_fdev = FUK.gpu.fdev.get() + np.testing.assert_allclose(expected_fdev, measured_fdev, rtol=1e-6, err_msg="Numpy fdev " + "is \n%s, \nbut gpu fdev is \n %s, \n " % ( + repr(expected_fdev), + repr(measured_fdev))) + + expected_ferr = nFUK.npy.ferr + measured_ferr = FUK.gpu.ferr.get() + + np.testing.assert_array_equal(expected_ferr, measured_ferr, err_msg="Numpy ferr" + "is \n%s, \nbut gpu ferr is \n %s, \n " % ( + repr(expected_ferr), + repr(measured_ferr))) + def test_fourier_deviation_UNITY(self): + ''' + setup - using the fourier_error as reference, so we need mask, etc. 
+ ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + G = 2 # number of object modes + + E = B # probe size y + F = C # probe size x + + scan_pts = 2 # one dimensional scan point number + + N = scan_pts ** 2 + total_number_modes = G * D + A = N * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE) # the measured magnitudes NxAxB + fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype) + fmag[:] = fmag_fill + + mask = np.empty(shape=(N, B, C), + dtype=FLOAT_TYPE) # the masks for the measured magnitudes either 1xAxB or NxAxB + mask_fill = np.ones_like(mask) + mask_fill[::2, ::2] = 0 # checkerboard for testing + mask[:] = mask_fill + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((N,)) + Y = Y.reshape((N,)) + + addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [position_idx, 0, 0], + [position_idx, 0, 0]]) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + mask_sum = mask.sum(-1).sum(-1) + + from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel + f_d = cp.asarray(f) + fmag_d = cp.asarray(fmag) + addr_d = cp.asarray(addr) + + nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes) + FUK = FourierUpdateKernel(f, nmodes=total_number_modes) + + nFUK.allocate() + FUK.allocate() + + nFUK.fourier_deviation(f, addr, fmag) + FUK.fourier_deviation(f_d, addr_d, fmag_d) + + expected_fdev = nFUK.npy.fdev + measured_fdev = FUK.gpu.fdev.get() + 
np.testing.assert_allclose(measured_fdev, expected_fdev, rtol=1e-6, err_msg="Numpy fdev " + "is \n%s, \nbut gpu fdev is \n %s, \n " % ( + repr(expected_fdev), + repr(measured_fdev))) + + + + def test_error_reduce_UNITY(self): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + G = 2 # number og object modes + + E = B # probe size y + F = C # probe size x + + scan_pts = 2 # one dimensional scan point number + + N = scan_pts ** 2 + total_number_modes = G * D + A = N * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE) # the measured magnitudes NxAxB + fmag_fill = np.arange(np.prod(fmag.shape).item()).reshape(fmag.shape).astype(fmag.dtype) + fmag[:] = fmag_fill + + mask = np.empty(shape=(N, B, C), + dtype=FLOAT_TYPE) # the masks for the measured magnitudes either 1xAxB or NxAxB + mask_fill = np.ones_like(mask) + mask_fill[::2, ::2] = 0 # checkerboard for testing + mask[:] = mask_fill + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((N,)) + Y = Y.reshape((N,)) + + addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [position_idx, 0, 0], + [position_idx, 0, 0]]) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + # print("address book is:") + # print(repr(addr)) + + ''' + test + ''' + err_fmag = np.zeros(N, dtype=FLOAT_TYPE) + mask_sum = mask.sum(-1).sum(-1) + + from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel + f_d = cp.asarray(f) + fmag_d = cp.asarray(fmag) + mask_d = 
cp.asarray(mask) + addr_d = cp.asarray(addr) + err_fmag_d = cp.asarray(err_fmag) + mask_sum_d = cp.asarray(mask_sum) + pbound_set = 0.9 + nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes) + FUK = FourierUpdateKernel(f, nmodes=total_number_modes, queue_thread=self.stream) + + nFUK.allocate() + FUK.allocate() + + nFUK.fourier_error(f, addr, fmag, mask, mask_sum) + nFUK.error_reduce(addr, err_fmag) + + + FUK.fourier_error(f_d, addr_d, fmag_d, mask_d, mask_sum_d) + FUK.error_reduce(addr_d, err_fmag_d) + + expected_err_fmag = err_fmag + measured_err_fmag = err_fmag_d.get() + + np.testing.assert_allclose(expected_err_fmag, measured_err_fmag, rtol=1.15207385e-07, + err_msg="Numpy err_fmag" + "is \n%s, \nbut gpu err_fmag is \n %s, \n " % ( + repr(expected_err_fmag), + repr(measured_err_fmag))) + + def test_error_reduce(self): + # array from the previous test + ferr = np.array([[[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [7.54033208e-01, 3.04839879e-01, 5.56465909e-02, 6.45330548e-03, 1.57260016e-01], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [5.26210022e+00, 6.81290817e+00, 8.56371498e+00, 1.05145216e+01, 1.26653280e+01], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]], + + [[1.61048353e+00, 2.15810299e+00, 2.78572226e+00, 3.49334168e+00, 4.28096104e+00], + [5.14858055e+00, 6.09619951e+00, 7.12381887e+00, 8.23143768e+00, 9.41905785e+00], + [1.06866770e+01, 1.20342960e+01, 1.34619150e+01, 1.49695349e+01, 1.65571537e+01], + [1.82247734e+01, 1.99723930e+01, 2.18000126e+01, 2.37076321e+01, 2.56952515e+01], + [2.77628708e+01, 2.99104881e+01, 3.21381073e+01, 3.44457283e+01, 3.68333473e+01]], + + [[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + [6.31699409e+01, 6.82966690e+01, 7.36233978e+01, 7.91501160e+01, 8.48768463e+01], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00], + 
[1.23437180e+02, 1.30563919e+02, 1.37890640e+02, 1.45417374e+02, 1.53144089e+02], + [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00]], + + [[4.58764343e+01, 4.86257210e+01, 5.14550095e+01, 5.43642960e+01, 5.73535805e+01], + [6.04228668e+01, 6.35721550e+01, 6.68014374e+01, 7.01107254e+01, 7.35000076e+01], + [7.69692993e+01, 8.05185852e+01, 8.41478729e+01, 8.78571548e+01, 9.16464386e+01], + [9.55157242e+01, 9.94650116e+01, 1.03494293e+02, 1.07603584e+02, 1.11792870e+02], + [1.16062157e+02, 1.20411446e+02, 1.24840721e+02, 1.29350006e+02, 1.33939301e+02]]], + dtype=FLOAT_TYPE) + # print(ferr.shape) + scan_pts = 2 # one dimensional scan point number + N = scan_pts ** 2 + + addr = np.zeros((N, 1, 5, 3)) + aux = np.zeros((4, 5, 5)) + FUK = FourierUpdateKernel(aux, nmodes=1) + err_mag = np.zeros(N, dtype=FLOAT_TYPE) + err_mag_d = cp.asarray(err_mag) + FUK.gpu.ferr = cp.asarray(ferr) + addr_d = cp.asarray(addr) + + FUK.error_reduce(addr_d, err_mag_d) + + # print(repr(ferr)) + measured_err_mag = err_mag_d.get() + + # print(repr(measured_err_mag)) + + expected_err_mag = np.array([45.096806, 388.54788, 1059.5702, 2155.6968], dtype=FLOAT_TYPE) + + np.testing.assert_array_equal(expected_err_mag, measured_err_mag, err_msg="The fourier_update_kernel.error_reduce" + "is not behaving as expected.") + + + def log_likelihood_UNITY_tester(self, use_version2=False): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + G = 2 # number of object modes + + E = B # probe size y + F = C # probe size x + + scan_pts = 2 # one dimensional scan point number + + N = scan_pts ** 2 + total_number_modes = G * D + A = N * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE) # the measured 
magnitudes NxAxB + fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype) + fmag[:] = fmag_fill + + mask = np.empty(shape=(N, B, C), + dtype=FLOAT_TYPE) # the masks for the measured magnitudes either 1xAxB or NxAxB + mask_fill = np.ones_like(mask) + mask_fill[::2, ::2] = 0 # checkerboard for testing + mask[:] = mask_fill + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((N,)) + Y = Y.reshape((N,)) + + addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [position_idx, 0, 0], + [position_idx, 0, 0]]) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + mask_sum = mask.sum(-1).sum(-1) + LLerr = np.zeros_like(mask_sum, dtype=np.float32) + f_d = cp.asarray(f) + fmag_d = cp.asarray(fmag) + mask_d = cp.asarray(mask) + addr_d = cp.asarray(addr) + LLerr_d = cp.asarray(LLerr) + + from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel + nFUK = npFourierUpdateKernel(f, nmodes=total_number_modes) + nFUK.allocate() + nFUK.log_likelihood(f, addr, fmag, mask, LLerr) + + FUK = FourierUpdateKernel(f, nmodes=total_number_modes) + FUK.allocate() + if use_version2: + FUK.log_likelihood2(f_d, addr_d, fmag_d, mask_d, LLerr_d) + else: + FUK.log_likelihood(f_d, addr_d, fmag_d, mask_d, LLerr_d) + + expected_err_phot = LLerr + measured_err_phot = LLerr_d.get() + + np.testing.assert_allclose(expected_err_phot, measured_err_phot, err_msg="Numpy log-likelihood error " + "is \n%s, \nbut gpu log-likelihood error is \n%s, \n " % ( + repr(expected_err_phot), + repr(measured_err_phot)), rtol=1e-5) + def test_log_likelihood_UNITY(self): + self.log_likelihood_UNITY_tester(False) + + def test_log_likelihood2_UNITY(self): + 
self.log_likelihood_UNITY_tester(True) + + def test_exit_error_UNITY(self): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + G = 2 # number of object modes + + E = B # probe size y + F = C # probe size x + + scan_pts = 2 # one dimensional scan point number + + N = scan_pts ** 2 + total_number_modes = G * D + A = N * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + aux = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + aux[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((N,)) + Y = Y.reshape((N,)) + + addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [position_idx, 0, 0], + [position_idx, 0, 0]]) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + from ptypy.accelerate.base.kernels import FourierUpdateKernel as npFourierUpdateKernel + aux_d = cp.asarray(aux) + addr_d = cp.asarray(addr) + + nFUK = npFourierUpdateKernel(aux, nmodes=total_number_modes) + FUK = FourierUpdateKernel(aux, nmodes=total_number_modes) + + nFUK.allocate() + FUK.allocate() + + nFUK.exit_error(aux, addr, ) + FUK.exit_error(aux_d, addr_d) + + expected_ferr = nFUK.npy.ferr + measured_ferr = FUK.gpu.ferr.get() + + np.testing.assert_allclose(expected_ferr, measured_ferr, err_msg="Numpy ferr" + "is \n%s, \nbut gpu ferr is \n %s, \n " % ( + repr(expected_ferr), + repr(measured_ferr)), rtol=1e-7) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/accelerate_tests/cuda_cupy_tests/gpudata_test.py b/test/accelerate_tests/cuda_cupy_tests/gpudata_test.py new file mode 100644 index 000000000..5d1be4f8d --- 
/dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/gpudata_test.py @@ -0,0 +1,265 @@ +''' +''' + +import unittest +import numpy as np +from . import CupyCudaTest, have_cupy + +if have_cupy(): + import cupy as cp + import cupyx + from ptypy.accelerate.cuda_cupy.mem_utils import GpuData, GpuDataManager + +class GpuDataTest(CupyCudaTest): + + def test_to_gpu_new(self): + # arrange + cpu = 2. * np.ones((5,5), dtype=np.float32) + gdata = GpuData(cpu.nbytes, syncback=False) + + # act + gpu = gdata.to_gpu(cpu, '1', self.stream) + self.stream.synchronize() + + # assert + np.testing.assert_array_equal(cpu, gpu.get()) + + def test_to_gpu_sameid(self): + # arrange + cpu = 2. * np.ones((5,5), dtype=np.float32) + gdata = GpuData(cpu.nbytes, syncback=False) + + # act + gpu1 = gdata.to_gpu(cpu, '1', self.stream) + cpu *= 2. + gpu2 = gdata.to_gpu(cpu, '1', self.stream) + self.stream.synchronize() + + # assert + np.testing.assert_array_equal(gpu1.get(), gpu2.get()) + + def test_to_gpu_new_syncback(self): + # arrange + cpu = 2. * np.ones((5,5), dtype=np.float32) + gdata = GpuData(cpu.nbytes, syncback=True) + + # act + gpu1 = gdata.to_gpu(cpu, '1', self.stream) + with self.stream: + gpu1.fill(np.float32(3.)) + cpu2 = 2. * cpu + gpu2 = gdata.to_gpu(cpu2, '2', self.stream) + self.stream.synchronize() + + # assert + np.testing.assert_array_equal(cpu, 3.) + np.testing.assert_array_equal(gpu2.get(), cpu2) + + def test_to_gpu_new_nosyncback(self): + # arrange + cpu = 2. * np.ones((5,5), dtype=np.float32) + gdata = GpuData(cpu.nbytes, syncback=False) + + # act + gpu1 = gdata.to_gpu(cpu, '1', self.stream) + with self.stream: + gpu1.fill(np.float32(3.)) + cpu2 = 2. * cpu + gpu2 = gdata.to_gpu(cpu2, '2', self.stream) + self.stream.synchronize() + + # assert + np.testing.assert_array_equal(cpu, 2.) + np.testing.assert_array_equal(gpu2.get(), cpu2) + + def test_from_gpu(self): + # arrange + cpu = 2. 
* np.ones((5,5), dtype=np.float32) + gdata = GpuData(cpu.nbytes, syncback=False) + + # act + gpu1 = gdata.to_gpu(cpu, '1', self.stream) + with self.stream: + gpu1.fill(np.float32(3.)) + gdata.from_gpu(self.stream) + self.stream.synchronize() + + def test_data_variable_size(self): + # arrange + cpu = np.ones((2,5), dtype=np.float32) + cpu2 = 2. * np.ones((1,5), dtype=np.float32) + gdata = GpuData(cpu.nbytes, syncback=False) + + # act + gpu = gdata.to_gpu(cpu, '1', self.stream) + gpu2 = gdata.to_gpu(cpu2, '2', self.stream) + self.stream.synchronize() + + # assert + np.testing.assert_array_equal(gpu2.get(), cpu2) + self.assertEqual(cpu2.nbytes, gpu2.nbytes) + np.testing.assert_array_equal(gpu.get(), np.array([ + [2, 2, 2, 2, 2], + [1, 1, 1, 1, 1] + ], dtype=np.float32)) + + def test_data_variable_size_raise(self): + # arrange + cpu = np.ones((1,5), dtype=np.float32) + cpu2 = np.ones((2,4), dtype=np.float32) + gdata = GpuData(cpu.nbytes, syncback=False) + + # act/assert + with self.assertRaises(Exception): + gdata.to_gpu(cpu2, '1', self.stream) + + def test_data_resize_raise(self): + # arrange + cpu = np.ones((5,5), dtype=np.float32) + gdata = GpuData(cpu.nbytes, syncback=False) + gpu = gdata.to_gpu(cpu, '1', self.stream) + cpu2 = np.ones((10,5), dtype=np.float32) + + # act + gdata.resize(cpu2.nbytes) + gpu2 = gdata.to_gpu(cpu2, '1', self.stream) + + # assert + self.assertEqual(gdata.gpuId, '1') + self.assertEqual(gdata.nbytes, cpu2.nbytes) + self.assertEqual(gpu2.size, cpu2.size) + self.assertGreaterEqual(gdata.nbytes_buffer, cpu2.nbytes) + + def test_data_resize_shrink(self): + # arrange + cpu = np.ones((5,5), dtype=np.float32) + gdata = GpuData(cpu.nbytes, syncback=False) + gpu = gdata.to_gpu(cpu, '1', self.stream) + cpu2 = np.ones((4,6), dtype=np.float32) + + # act + gdata.resize(cpu2.nbytes) + gpu2 = gdata.to_gpu(cpu2, '1', self.stream) + + # assert + self.assertEqual(gdata.gpuId, '1') + self.assertEqual(gdata.nbytes, cpu2.nbytes) + self.assertEqual(gpu2.size, 
cpu2.size) + self.assertGreaterEqual(gdata.nbytes_buffer, cpu2.nbytes) + + def test_datamanager_memory(self): + # arrange / act + gdm = GpuDataManager(128, 4) + gdm.reset(124, 3) + + # assert + self.assertEqual(gdm.memory, 3*128) + self.assertEqual(gdm.nbytes, 124) + + def test_datamanager_free(self): + # arrange + gdm = GpuDataManager(128, 2) + + # act + gdm.free() + + # assert + self.assertEqual(gdm.memory, 0) + + def test_datamanager_newids(self): + # arrange + cpu1 = 2. * np.ones((5,5), dtype=np.float32) + cpu2 = 2. * cpu1 # 4 + cpu3 = 2. * cpu2 # 8 + cpu4 = 2. * cpu3 # 16 + gdm = GpuDataManager(cpu1.nbytes, 4, syncback=False) + + # act + gpu1 = gdm.to_gpu(cpu1, '1', self.stream)[1] + gpu2 = gdm.to_gpu(cpu2, '2', self.stream)[1] + gpu11 = gdm.to_gpu(-1.*cpu1, '1', self.stream)[1] + gpu21 = gdm.to_gpu(-1.*cpu4, '2', self.stream)[1] + gpu3 = gdm.to_gpu(cpu3, '3', self.stream)[1] + gpu31 = gdm.to_gpu(-1.*cpu1, '3', self.stream)[1] + gpu4 = gdm.to_gpu(cpu4, '4', self.stream)[1] + gpu41 = gdm.to_gpu(-1.*cpu1, '4', self.stream)[1] + self.stream.synchronize() + + # assert + np.testing.assert_array_equal(cpu1, gpu1.get()) + np.testing.assert_array_equal(cpu1, gpu11.get()) + np.testing.assert_array_equal(cpu1, 2.) + np.testing.assert_array_equal(cpu2, gpu2.get()) + np.testing.assert_array_equal(cpu2, gpu21.get()) + np.testing.assert_array_equal(cpu2, 4.) + np.testing.assert_array_equal(cpu3, gpu3.get()) + np.testing.assert_array_equal(cpu3, gpu31.get()) + np.testing.assert_array_equal(cpu3, 8.) + np.testing.assert_array_equal(cpu4, gpu4.get()) + np.testing.assert_array_equal(cpu4, gpu41.get()) + np.testing.assert_array_equal(cpu4, 16.) + + def test_datamanager_syncback(self): + # arrange + cpu1 = 2. * np.ones((5,5), dtype=np.float32) + cpu2 = 2. * cpu1 # 4 + cpu3 = 2. * cpu2 # 8 + cpu4 = 2. 
* cpu3 # 16 + gdm = GpuDataManager(cpu1.nbytes, 2, syncback=True) + + # act + gpu1 = gdm.to_gpu(cpu1, '1', self.stream)[1] + gpu2 = gdm.to_gpu(cpu2, '2', self.stream)[1] + with self.stream: + gpu1.fill(np.float32(3.)) + gpu2.fill(np.float32(5.)) + gpu3 = gdm.to_gpu(cpu3, '3', self.stream)[1] + with self.stream: + gpu3.fill(np.float32(7.)) + gpu4 = gdm.to_gpu(cpu4, '4', self.stream)[1] + with self.stream: + gpu4.fill(np.float32(9.)) + gdm.syncback = False + gpu5 = gdm.to_gpu(cpu4*.2, '5', self.stream)[1] + gpu6 = gdm.to_gpu(cpu4*.4, '6', self.stream)[1] + self.stream.synchronize() + + # assert + np.testing.assert_array_equal(cpu1, 3.) + np.testing.assert_array_equal(cpu2, 5.) + np.testing.assert_array_equal(cpu3, 8.) + np.testing.assert_array_equal(cpu4, 16.) + + def test_data_synctransfer(self): + # arrange + sh = (1024, 1024, 1) # 4MB + cpu1 = cupyx.zeros_pinned(sh, np.float32, order="C") + cpu2 = cupyx.zeros_pinned(sh, np.float32, order="C") + cpu1[:] = 1. + cpu2[:] = 2. + gdata = GpuData(cpu1.nbytes, syncback=True) + # long-running kernel + knl = """ + extern "C" __global__ void tfill(float* d, int sz, float dval) { + for (int i = 0; i < sz; ++i) + d[i] = dval; + } + """ + tfill = cp.RawKernel(knl, "tfill") + + # act + s2 = cp.cuda.Stream() + gpu1 = gdata.to_gpu(cpu1, '1', self.stream) + with self.stream: + tfill(grid=(1,1,1), block=(1,1,1), args=(gpu1, np.int32(gpu1.size), np.float32(2.))) + gdata.record_done(self.stream) # it will fail without this + gpu2 = gdata.to_gpu(cpu2, '2', s2) + with s2: + tfill(grid=(1,1,1), block=(1,1,1), args=(gpu2, np.int32(gpu2.size), np.float32(4.))) + gdata.from_gpu(s2) + self.stream.synchronize() + s2.synchronize() + + # assert + np.testing.assert_array_equal(cpu1, 2.) + np.testing.assert_array_equal(cpu2, 4.) 
diff --git a/test/accelerate_tests/cuda_cupy_tests/gradient_descent_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/gradient_descent_kernel_test.py new file mode 100644 index 000000000..b3a6e8ff7 --- /dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/gradient_descent_kernel_test.py @@ -0,0 +1,327 @@ +''' + + +''' + +import unittest +import numpy as np +from . import perfrun, CupyCudaTest, have_cupy + + +if have_cupy(): + import cupy as cp + from ptypy.accelerate.cuda_cupy.kernels import GradientDescentKernel + + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + + +class GradientDescentKernelTest(CupyCudaTest): + + def prepare_arrays(self, performance=False): + if not performance: + nmodes = 2 + N_buf = 4 + N = 3 + A = 3 + else: + nmodes = 4 + N_buf = 100 + N = 80 + A = 512 + i_sh = (N, A, A) + e_sh = (N*nmodes, A, A) + f_sh = (N_buf, A, A) + a_sh = (N_buf * nmodes, A, A) + w = np.ones(i_sh, dtype=FLOAT_TYPE) + for idx, sl in enumerate(w): + sl[idx % A, idx % A] = 0.0 + X, Y, Z = np.indices(a_sh, dtype=COMPLEX_TYPE) + b_f = X + 1j * Y + b_a = Y + 1j * Z + b_b = Z + 1j * X + err_sum = np.zeros((N,), dtype=FLOAT_TYPE) + fic = np.ones((N,), dtype=FLOAT_TYPE) + addr = np.zeros((N, nmodes, 5, 3), dtype=INT_TYPE) + I = np.empty(i_sh, dtype=FLOAT_TYPE) + I[:] = np.round(np.abs(b_f[:N])**2 % 20) + for pos_idx in range(N): + for mode_idx in range(nmodes): + exit_idx = pos_idx * nmodes + mode_idx + addr[pos_idx, mode_idx] = np.array([[mode_idx, 0, 0], + [0, 0, 0], + [exit_idx, 0, 0], + [pos_idx, 0, 0], + [pos_idx, 0, 0]], dtype=INT_TYPE) + return (cp.asarray(b_f), + cp.asarray(b_a), + cp.asarray(b_b), + cp.asarray(I), + cp.asarray(w), + cp.asarray(err_sum), + cp.asarray(addr), + cp.asarray(fic)) + + def test_allocate(self): + b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays() + GDK = GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + + def test_make_model(self): + b_f, b_a, b_b, I, w, err_sum, addr, fic = 
self.prepare_arrays() + + GDK = GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.make_model(b_f, addr) + + exp_Imodel = np.array([[[1., 1., 1.], + [3., 3., 3.], + [9., 9., 9.]], + + [[13., 13., 13.], + [15., 15., 15.], + [21., 21., 21.]], + + [[41., 41., 41.], + [43., 43., 43.], + [49., 49., 49.]], + + [[85., 85., 85.], + [87., 87., 87.], + [93., 93., 93.]]], dtype=FLOAT_TYPE) + + np.testing.assert_array_almost_equal( + exp_Imodel, GDK.gpu.Imodel.get(), + err_msg="`Imodel` buffer has not been updated as expected") + + @unittest.skipIf(not perfrun, "performance test") + def test_make_model_performance(self): + b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True) + + GDK = GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.make_model(b_f, addr) + + def test_floating_intensity(self): + b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays() + GDK=GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.gpu.Imodel[0] = I[0] * 3. + GDK.gpu.Imodel[1] = I[1] * 2. 
+ GDK.gpu.Imodel[2] = I[2] + GDK.floating_intensity(addr, w, I, fic) + #print('Imodel',repr(GDK.gpu.Imodel)) + #print('fic',repr(1./fic)) + exp_Imodel = np.array([[[0., 0., 0.], + [1., 1., 1.], + [4., 4., 4.]], + + [[1., 1., 1.], + [2., 2., 2.], + [5., 5., 5.]], + + [[4., 4., 4.], + [5., 5., 5.], + [8., 8., 8.]], + + [[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]], dtype=np.float32) + exp_fic=1./np.array([3., 2., 1.], dtype=np.float32) + np.testing.assert_array_almost_equal(exp_Imodel, GDK.gpu.Imodel.get(), + err_msg="`Imodel` buffer has not been updated as expected") + np.testing.assert_array_almost_equal(exp_fic, fic.get(), + err_msg="floating intensity coeff (fic) has not been updated as expected") + + def test_make_a012(self): + b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays() + GDK = GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.make_a012(b_f, b_a, b_b, addr, I, fic) + + exp_A0 = np.array([[[1., 1., 1.], + [2., 2., 2.], + [5., 5., 5.]], + + [[12., 12., 12.], + [13., 13., 13.], + [16., 16., 16.]], + + [[37., 37., 37.], + [38., 38., 38.], + [41., 41., 41.]], + + [[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal( + exp_A0, GDK.gpu.Imodel.get(), + err_msg="`Imodel` buffer (=A0) has not been updated as expected") + + exp_A1 = np.array([[[0., 0., 0.], + [2., 6., 10.], + [4., 12., 20.]], + + [[0., 0., 0.], + [10., 14., 18.], + [20., 28., 36.]], + + [[0., 0., 0.], + [18., 22., 26.], + [36., 44., 52.]], + + [[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal( + exp_A1, GDK.gpu.LLerr.get(), + err_msg="`LLerr` buffer (=A1) has not been updated as expected") + + exp_A2 = np.array([[[0., 4., 12.], + [4., 8., 16.], + [12., 16., 24.]], + + [[0., 12., 28.], + [12., 24., 40.], + [28., 40., 56.]], + + [[0., 20., 44.], + [20., 40., 64.], + [44., 64., 88.]], + + [[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]], dtype=FLOAT_TYPE) + 
np.testing.assert_array_almost_equal( + exp_A2, GDK.gpu.LLden.get(), + err_msg="`LLden` buffer (=A2) has not been updated as expected") + + @unittest.skipIf(not perfrun, "performance test") + def test_make_a012_performance(self): + b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True) + + GDK = GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.make_a012(b_f, b_a, b_b, addr, I, fic) + + def test_fill_b(self): + b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays() + Brenorm = 0.35 + B = np.zeros((3,), dtype=FLOAT_TYPE) + B_dev = cp.asarray(B) + GDK = GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.make_a012(b_f, b_a, b_b, addr, I, fic) + GDK.fill_b(addr, Brenorm, w, B_dev) + B[:] = B_dev.get() + + exp_B = np.array([ 4699.8, 5398.4, 13398.], dtype=FLOAT_TYPE) + np.testing.assert_allclose( + B, exp_B, + rtol=1e-7, + err_msg="`B` has not been updated as expected") + + @unittest.skipIf(not perfrun, "performance test") + def test_fill_b_perf(self): + b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True) + Brenorm = 0.35 + B = np.zeros((3,), dtype=FLOAT_TYPE) + B_dev = cp.asarray(B) + GDK = GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.make_a012(b_f, b_a, b_b, addr, I, fic) + GDK.fill_b(addr, Brenorm, w, B_dev) + + def test_error_reduce(self): + b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays() + GDK = GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.npy.LLerr = np.indices(GDK.gpu.LLerr.shape, dtype=FLOAT_TYPE)[0] + GDK.gpu.LLerr = cp.asarray(GDK.npy.LLerr) + GDK.error_reduce(addr, err_sum) + + exp_err = np.array([0., 9., 18.], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal( + exp_err, err_sum.get(), + err_msg="`err_sum` has not been updated as expected") + + @unittest.skipIf(not perfrun, "performance test") + def test_error_reduce_perf(self): + b_f, b_a, b_b, I, w, err_sum, addr, fic = 
self.prepare_arrays(performance=True) + GDK = GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.npy.LLerr = np.indices(GDK.gpu.LLerr.shape, dtype=FLOAT_TYPE)[0] + GDK.gpu.LLerr = cp.asarray(GDK.npy.LLerr) + GDK.error_reduce(addr, err_sum) + + def test_main(self): + b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays() + GDK = GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.main(b_f, addr, w, I) + + exp_b_f = np.array([[[0. + 0.j, 0. + 0.j, 0. + 0.j], + [-0. - 1.j, -0. - 1.j, -0. - 1.j], + [-0. - 8.j, -0. - 8.j, -0. - 8.j]], + + [[0. + 0.j, 0. + 0.j, 0. + 0.j], + [-1. - 1.j, -1. - 1.j, -1. - 1.j], + [-4. - 8.j, -4. - 8.j, -4. - 8.j]], + + [[-2. + 0.j, -2. + 0.j, -2. + 0.j], + [-4. - 2.j, -0. + 0.j, -4. - 2.j], + [-10.-10.j, -10.-10.j, -10.-10.j]], + + [[-3. + 0.j, -3. + 0.j, -3. + 0.j], + [-6. - 2.j, -0. + 0.j, -6. - 2.j], + [-15.-10.j, -15.-10.j, -15.-10.j]], + + [[-16. + 0.j, -16. + 0.j, -16. + 0.j], + [-20. - 5.j, -20. - 5.j, -20. - 5.j], + [-32.-16.j, -32.-16.j, -0. + 0.j]], + + [[-20. + 0.j, -20. + 0.j, -20. + 0.j], + [-25. - 5.j, -25. - 5.j, -25. - 5.j], + [-40.-16.j, -40.-16.j, -0. + 0.j]], + + [[6. + 0.j, 6. + 0.j, 6. + 0.j], + [6. + 1.j, 6. + 1.j, 6. + 1.j], + [6. + 2.j, 6. + 2.j, 6. + 2.j]], + + [[7. + 0.j, 7. + 0.j, 7. + 0.j], + [7. + 1.j, 7. + 1.j, 7. + 1.j], + [7. + 2.j, 7. + 2.j, 7. 
+ 2.j]]], dtype=COMPLEX_TYPE) + np.testing.assert_array_almost_equal( + exp_b_f, b_f.get(), + err_msg="Auxiliary has not been updated as expected") + + exp_LL = np.array([[[0., 0., 0.], + [1., 1., 1.], + [16., 16., 16.]], + + [[1., 1., 1.], + [4., 0., 4.], + [25., 25., 25.]], + + [[16., 16., 16.], + [25., 25., 25.], + [64., 64., 0.]], + + [[0., 0., 0.], + [0., 0., 0.], + [0., 0., 0.]]], dtype=FLOAT_TYPE) + np.testing.assert_array_almost_equal( + exp_LL, GDK.gpu.LLerr.get(), + err_msg="LogLikelihood error has not been updated as expected") + + @unittest.skipIf(not perfrun, "performance test") + def test_main_perf(self): + b_f, b_a, b_b, I, w, err_sum, addr, fic = self.prepare_arrays(performance=True) + GDK = GradientDescentKernel(b_f, addr.shape[1]) + GDK.allocate() + GDK.main(b_f, addr, w, I) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/test/accelerate_tests/cuda_cupy_tests/import_test.py b/test/accelerate_tests/cuda_cupy_tests/import_test.py new file mode 100644 index 000000000..3af6dff3f --- /dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/import_test.py @@ -0,0 +1,10 @@ +""" +Import test +""" +import unittest + +class AutoLoaderTest(unittest.TestCase): + + def test_load_engines_cupy(self): + import ptypy + ptypy.load_gpu_engines("cupy") diff --git a/test/accelerate_tests/cuda_cupy_tests/multi_gpu_test.py b/test/accelerate_tests/cuda_cupy_tests/multi_gpu_test.py new file mode 100644 index 000000000..0c234d878 --- /dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/multi_gpu_test.py @@ -0,0 +1,74 @@ +''' +''' + +import unittest +from mpi4py.MPI import Get_version +import numpy as np +from . 
import CupyCudaTest, have_cupy + +if have_cupy(): + import cupy as cp + from ptypy.accelerate.cuda_cupy import multi_gpu as mgpu + from ptypy.utils import parallel + +from pkg_resources import parse_version + +class GpuDataTest(CupyCudaTest): + """ + This is a test class for MPI - to really check if it all works, it needs + to be run as: + + mpirun -np 2 pytest multi_gpu_test.py + + For CUDA-aware MPI testing, currently the environment variable + + OMPI_MCA_opal_cuda_support=true + + needs to be set, mpi4py version 3.1.0+ used, and a cuda-aware MPI version. + """ + + def setUp(self): + if parallel.rank_local < cp.cuda.runtime.getDeviceCount(): + self.device = cp.cuda.Device(parallel.rank_local) + self.device.use() + else: + self.device = None + + @unittest.skipIf(parallel.rank != 0, "Only in MPI rank 0") + def test_version(self): + v1 = parse_version("3.1.0") + v2 = parse_version(parse_version("3.1.0a").base_version) + + self.assertGreaterEqual(v2, v1) + + def test_compute_mode(self): + attr = cp.cuda.Device().attributes + self.assertIn("ComputeMode", attr) + mode = attr["ComputeMode"] + self.assertIn(mode, [0, 1, 2, 3]) + + def multigpu_tester(self, com): + if self.device is None: + return + + data = np.ones((2, 1), dtype=np.float32) + data_dev = cp.asarray(data) + sz = parallel.size + com.allReduceSum(data_dev) + + out = data_dev.get() + np.testing.assert_allclose(out, sz * data, rtol=1e-6) + + def test_multigpu_auto(self): + self.multigpu_tester(mgpu.get_multi_gpu_communicator()) + + def test_multigpu_mpi(self): + self.multigpu_tester(mgpu.MultiGpuCommunicatorMpi()) + + @unittest.skipIf(not mgpu.have_cuda_mpi, "Cuda-aware MPI not available") + def test_multigpu_cudampi(self): + self.multigpu_tester(mgpu.MultiGpuCommunicatorCudaMpi()) + + @unittest.skipIf(not mgpu.have_nccl, "NCCL not available") + def test_multigpu_nccl(self): + self.multigpu_tester(mgpu.MultiGpuCommunicatorNccl()) \ No newline at end of file diff --git 
a/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py new file mode 100644 index 000000000..8a41bad35 --- /dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/po_update_kernel_test.py @@ -0,0 +1,943 @@ +''' + + +''' + +import unittest +import numpy as np +from . import CupyCudaTest, have_cupy +from ptypy.accelerate.base.array_utils import max_abs2 + +if have_cupy(): + import cupy as cp + from ptypy.accelerate.cuda_cupy.kernels import PoUpdateKernel + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + + +class PoUpdateKernelTest(CupyCudaTest): + + def prepare_arrays(self, scan_points=None): + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. + G = 2 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + if scan_points is None: + scan_pts = 2 # one dimensional scan point number + else: + scan_pts = scan_points + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + + exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + 
addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): # + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE) + for idx in range(G): + object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) + + probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE) + for idx in range(D): + probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) + + return (cp.asarray(addr), + cp.asarray(object_array), + cp.asarray(object_array_denominator), + cp.asarray(probe), + cp.asarray(exit_wave), + cp.asarray(probe_denominator)) + + + def test_init(self): + POUK = PoUpdateKernel() + np.testing.assert_equal(POUK.kernels, ['pr_update', 'ob_update'], + err_msg='PoUpdateKernel does not have the correct functions registered.') + + def ob_update_REGRESSION_tester(self, atomics=True): + + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. 
+ G = 2 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + scan_pts = 2 # one dimensional scan point number + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + + exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y):# + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE) + for idx in range(G): + object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) + + + POUK = PoUpdateKernel() + from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel + nPOUK = npPoUpdateKernel() + # print("object array denom before:") + # print(object_array_denominator) + object_array_dev = cp.asarray(object_array) + object_array_denominator_dev = cp.asarray(object_array_denominator) + probe_dev = 
cp.asarray(probe) + exit_wave_dev = cp.asarray(exit_wave) + if not atomics: + addr2 = np.ascontiguousarray(np.transpose(addr, (2, 3, 0, 1))) + addr_dev = cp.asarray(addr2) + else: + addr_dev = cp.asarray(addr) + + print(object_array_denominator) + POUK.ob_update(addr_dev, object_array_dev, object_array_denominator_dev, probe_dev, exit_wave_dev, atomics=atomics) + print("\n\n cuda version") + print(object_array_denominator_dev.get()) + nPOUK.ob_update(addr, object_array, object_array_denominator, probe, exit_wave) + print("\n\n numpy version") + print(object_array_denominator) + + + + expected_object_array = np.array([[[15.+1.j, 53.+1.j, 53.+1.j, 53.+1.j, 53.+1.j, 39.+1.j, 1.+1.j], + [77.+1.j, 201.+1.j, 201.+1.j, 201.+1.j, 201.+1.j, 125.+1.j, 1.+1.j], + [77.+1.j, 201.+1.j, 201.+1.j, 201.+1.j, 201.+1.j, 125.+1.j, 1.+1.j], + [77.+1.j, 201.+1.j, 201.+1.j, 201.+1.j, 201.+1.j, 125.+1.j, 1.+1.j], + [77.+1.j, 201.+1.j, 201.+1.j, 201.+1.j, 201.+1.j, 125.+1.j, 1.+1.j], + [63.+1.j, 149.+1.j, 149.+1.j, 149.+1.j, 149.+1.j, 87.+1.j, 1.+1.j], + [1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j, 1.+1.j]], + [[24. + 4.j, 68. + 4.j, 68. + 4.j, 68. + 4.j, 68. + 4.j, 48. + 4.j, 4. + 4.j], + [92. + 4.j, 228. + 4.j, 228. + 4.j, 228. + 4.j, 228. + 4.j, 140. + 4.j, 4. + 4.j], + [92. + 4.j, 228. + 4.j, 228. + 4.j, 228. + 4.j, 228. + 4.j, 140. + 4.j, 4. + 4.j], + [92. + 4.j, 228. + 4.j, 228. + 4.j, 228. + 4.j, 228. + 4.j, 140. + 4.j, 4. + 4.j], + [92. + 4.j, 228. + 4.j, 228. + 4.j, 228. + 4.j, 228. + 4.j, 140. + 4.j, 4. + 4.j], + [72. + 4.j, 164. + 4.j, 164. + 4.j, 164. + 4.j, 164. + 4.j, 96. + 4.j, 4. + 4.j], + [4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. 
+ 4.j]]], + dtype=COMPLEX_TYPE) + + + np.testing.assert_array_equal(object_array_dev.get(), expected_object_array, + err_msg="The object array has not been updated as expected") + + expected_object_array_denominator = np.array([[[12., 22., 22., 22., 22., 12., 2.], + [22., 42., 42., 42., 42., 22., 2.], + [22., 42., 42., 42., 42., 22., 2.], + [22., 42., 42., 42., 42., 22., 2.], + [22., 42., 42., 42., 42., 22., 2.], + [12., 22., 22., 22., 22., 12., 2.], + [ 2., 2., 2., 2., 2., 2., 2.]], + + [[17., 27., 27., 27., 27., 17., 7.], + [27., 47., 47., 47., 47., 27., 7.], + [27., 47., 47., 47., 47., 27., 7.], + [27., 47., 47., 47., 47., 27., 7.], + [27., 47., 47., 47., 47., 27., 7.], + [17., 27., 27., 27., 27., 17., 7.], + [ 7., 7., 7., 7., 7., 7., 7.]]], + dtype=FLOAT_TYPE) + + + np.testing.assert_array_equal(object_array_denominator_dev.get(), expected_object_array_denominator, + err_msg="The object array denominator has not been updated as expected") + + + def test_ob_update_atomics_REGRESSION(self): + self.ob_update_REGRESSION_tester(atomics=True) + + def test_ob_update_tiled_REGRESSION(self): + self.ob_update_REGRESSION_tester(atomics=False) + + def ob_update_UNITY_tester(self, atomics=True): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. 
+ G = 2 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + scan_pts = 2 # one dimensional scan point number + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + + exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y):# + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + object_array_denominator = np.empty_like(object_array, dtype=FLOAT_TYPE) + for idx in range(G): + object_array_denominator[idx] = np.ones((H, I)) * (5 * idx + 2) + + + POUK = PoUpdateKernel() + + from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel + nPOUK = npPoUpdateKernel() + + object_array_dev = cp.asarray(object_array) + object_array_denominator_dev = cp.asarray(object_array_denominator) + probe_dev = cp.asarray(probe) + exit_wave_dev = cp.asarray(exit_wave) + if not atomics: + 
addr2 = np.ascontiguousarray(np.transpose(addr, (2, 3, 0, 1))) + addr_dev = cp.asarray(addr2) + else: + addr_dev = cp.asarray(addr) + + # print(object_array_denominator) + POUK.ob_update(addr_dev, object_array_dev, object_array_denominator_dev, probe_dev, exit_wave_dev, atomics=atomics) + # print("\n\n cuda version") + # print(repr(object_array_dev.get())) + # print(repr(object_array_denominator_dev.get())) + nPOUK.ob_update(addr, object_array, object_array_denominator, probe, exit_wave) + # print("\n\n numpy version") + # print(repr(object_array_denominator)) + # print(repr(object_array)) + + + np.testing.assert_array_equal(object_array, object_array_dev.get(), + err_msg="The object array has not been updated as expected") + + + np.testing.assert_array_equal(object_array_denominator, object_array_denominator_dev.get(), + err_msg="The object array denominatorhas not been updated as expected") + + + def test_ob_update_atomics_UNITY(self): + self.ob_update_UNITY_tester(atomics=True) + + def test_ob_update_tiled_UNITY(self): + self.ob_update_UNITY_tester(atomics=False) + + def pr_update_REGRESSION_tester(self, atomics=True): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. 
+ G = 2 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + scan_pts = 2 # one dimensional scan point number + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + + exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): # + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE) + for idx in range(D): + probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) + + POUK = PoUpdateKernel() + + # print("probe array before:") + # print(repr(probe)) + # print("probe denominator array before:") + # print(repr(probe_denominator)) + + object_array_dev = cp.asarray(object_array) + probe_denominator_dev = cp.asarray(probe_denominator) + probe_dev = cp.asarray(probe) + exit_wave_dev = cp.asarray(exit_wave) + if not atomics: + addr2 = 
np.ascontiguousarray(np.transpose(addr, (2, 3, 0, 1))) + addr_dev = cp.asarray(addr2) + else: + addr_dev = cp.asarray(addr) + + + POUK.pr_update(addr_dev, probe_dev, probe_denominator_dev, object_array_dev, exit_wave_dev, atomics=atomics) + + # print("probe array after:") + # print(repr(probe)) + # print("probe denominator array after:") + # print(repr(probe_denominator)) + expected_probe = np.array([[[313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j], + [313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j], + [313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j], + [313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j], + [313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j, 313.+1.j]], + + [[394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j], + [394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j], + [394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j], + [394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j], + [394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j, 394.+2.j]]], + dtype=COMPLEX_TYPE) + + np.testing.assert_array_equal(probe_dev.get(), expected_probe, + err_msg="The probe has not been updated as expected") + + expected_probe_denominator = np.array([[[138., 138., 138., 138., 138.], + [138., 138., 138., 138., 138.], + [138., 138., 138., 138., 138.], + [138., 138., 138., 138., 138.], + [138., 138., 138., 138., 138.]], + + [[143., 143., 143., 143., 143.], + [143., 143., 143., 143., 143.], + [143., 143., 143., 143., 143.], + [143., 143., 143., 143., 143.], + [143., 143., 143., 143., 143.]]], + dtype=FLOAT_TYPE) + + np.testing.assert_array_equal(probe_denominator_dev.get(), expected_probe_denominator, + err_msg="The probe denominatorhas not been updated as expected") + + + def test_pr_update_atomics_REGRESSION(self): + self.pr_update_REGRESSION_tester(atomics=True) + + def test_pr_update_tiled_REGRESSION(self): + self.pr_update_REGRESSION_tester(atomics=False) + + def pr_update_UNITY_tester(self, atomics=True): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of 
probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. + G = 2 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + scan_pts = 2 # one dimensional scan point number + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + + exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): # + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + probe_denominator = np.empty_like(probe, dtype=FLOAT_TYPE) + for idx in range(D): + probe_denominator[idx] = np.ones((E, F)) * (5 * idx + 2) + + POUK = PoUpdateKernel() + from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel + nPOUK = npPoUpdateKernel() + + # print("probe array before:") + # print(repr(probe)) + # print("probe denominator array 
before:") + # print(repr(probe_denominator)) + + object_array_dev = cp.asarray(object_array) + probe_denominator_dev = cp.asarray(probe_denominator) + probe_dev = cp.asarray(probe) + exit_wave_dev = cp.asarray(exit_wave) + if not atomics: + addr2 = np.ascontiguousarray(np.transpose(addr, (2, 3, 0, 1))) + addr_dev = cp.asarray(addr2) + else: + addr_dev = cp.asarray(addr) + + + POUK.pr_update(addr_dev, probe_dev, probe_denominator_dev, object_array_dev, exit_wave_dev, atomics=atomics) + nPOUK.pr_update(addr, probe, probe_denominator, object_array, exit_wave) + + # print("probe array after:") + # print(repr(probe)) + # print("probe denominator array after:") + # print(repr(probe_denominator)) + + np.testing.assert_array_equal(probe, probe_dev.get(), + err_msg="The probe has not been updated as expected") + + np.testing.assert_array_equal(probe_denominator, probe_denominator_dev.get(), + err_msg="The probe denominatorhas not been updated as expected") + + + def test_pr_update_atomics_UNITY(self): + self.pr_update_UNITY_tester(atomics=True) + + def test_pr_update_tiled_UNITY(self): + self.pr_update_UNITY_tester(atomics=False) + + + def pr_update_ML_tester(self, atomics=False): + ''' + setup + ''' + addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays() + ''' + test + ''' + POUK = PoUpdateKernel() + + POUK.allocate() # this doesn't do anything, but is the call pattern. + + if not atomics: + addr2 = np.ascontiguousarray(np.transpose(addr.get(), (2, 3, 0, 1))) + addr = cp.asarray(addr2) + + POUK.pr_update_ML(addr, probe, object_array, exit_wave, atomics=atomics) + + expected_probe = np.array([[[625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j], + [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j], + [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j], + [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j], + [625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j, 625. + 1.j]], + + [[786. 
+ 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j], + [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j], + [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j], + [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j], + [786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j, 786. + 2.j]]], + dtype=COMPLEX_TYPE) + + np.testing.assert_array_equal(probe.get(), expected_probe, + err_msg="The probe has not been updated as expected") + + def test_pr_update_ML_atomics_REGRESSION(self): + self.pr_update_ML_tester(True) + + def test_pr_update_ML_tiled_REGRESSION(self): + self.pr_update_ML_tester(False) + + def ob_update_ML_tester(self, atomics=True): + ''' + setup + ''' + addr, object_array, object_array_denominator, probe, exit_wave, probe_denominator = self.prepare_arrays() + ''' + test + ''' + POUK = PoUpdateKernel() + + POUK.allocate() # this doesn't do anything, but is the call pattern. + + if not atomics: + addr2 = np.ascontiguousarray(np.transpose(addr.get(), (2, 3, 0, 1))) + addr = cp.asarray(addr2) + + POUK.ob_update_ML(addr, object_array, probe, exit_wave, atomics=atomics) + + expected_object_array = np.array( + [[[29. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 105. + 1.j, 77. + 1.j, 1. + 1.j], + [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j], + [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j], + [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j], + [153. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 401. + 1.j, 249. + 1.j, 1. + 1.j], + [125. + 1.j, 297. + 1.j, 297. + 1.j, 297. + 1.j, 297. + 1.j, 173. + 1.j, 1. + 1.j], + [1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j, 1. + 1.j]], + + [[44. + 4.j, 132. + 4.j, 132. + 4.j, 132. + 4.j, 132. + 4.j, 92. + 4.j, 4. + 4.j], + [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j], + [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. 
+ 4.j], + [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j], + [180. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 452. + 4.j, 276. + 4.j, 4. + 4.j], + [140. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 324. + 4.j, 188. + 4.j, 4. + 4.j], + [4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j, 4. + 4.j]]], + dtype=COMPLEX_TYPE) + + np.testing.assert_array_equal(object_array.get(), expected_object_array, + err_msg="The object array has not been updated as expected") + + def test_ob_update_ML_atomics_REGRESSION(self): + self.ob_update_ML_tester(True) + + def test_ob_update_ML_tiled_REGRESSION(self): + self.ob_update_ML_tester(False) + + def test_ob_update_local_UNITY(self): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. + G = 2 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + scan_pts = 1 # one dimensional scan point number + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + + exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + auxiliary_wave = exit_wave.copy() * 2 + + probe_norm = np.empty(shape=(1,B,C), dtype=FLOAT_TYPE) + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = 
X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y):# + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel + nPOUK = npPoUpdateKernel() + POUK = PoUpdateKernel(queue_thread=self.stream) + + object_array_dev = cp.asarray(object_array) + probe_dev = cp.asarray(probe) + exit_wave_dev = cp.asarray(exit_wave) + auxiliary_wave_dev = cp.asarray(auxiliary_wave) + probe_norm_dev = cp.asarray(probe_norm) + addr_dev = cp.asarray(addr) + + POUK.pr_norm_local(addr_dev, probe_dev, probe_norm_dev) + POUK.ob_update_local(addr_dev, object_array_dev, probe_dev, exit_wave_dev, auxiliary_wave_dev, probe_norm_dev, a=0.5, b=0.5) + nPOUK.pr_norm_local(addr, probe, probe_norm) + nPOUK.ob_update_local(addr, object_array, probe, exit_wave, auxiliary_wave, probe_norm, a=0.5, b=0.5) + + np.testing.assert_allclose(object_array_dev.get(), object_array, rtol=1e-6, atol=1e-6, + err_msg="The object array has not been updated as expected") + + def test_pr_update_local_UNITY(self): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. 
+ G = 2 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + scan_pts = 1 # one dimensional scan point number + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + + exit_wave = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + exit_wave[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + auxiliary_wave = exit_wave.copy() * 1.5 + + object_norm = np.empty(shape=(1,B,C), dtype=FLOAT_TYPE) + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y):# + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel + nPOUK = npPoUpdateKernel() + POUK = PoUpdateKernel() + + object_array_dev = cp.asarray(object_array) + probe_dev = cp.asarray(probe) + exit_wave_dev = cp.asarray(exit_wave) + auxiliary_wave_dev = cp.asarray(auxiliary_wave) + object_norm_dev = cp.asarray(object_norm) + addr_dev = cp.asarray(addr) + + POUK.ob_norm_local(addr_dev, 
object_array_dev, object_norm_dev) + POUK.pr_update_local(addr_dev, probe_dev, object_array_dev,exit_wave_dev, auxiliary_wave_dev, object_norm_dev, cp.max(object_norm_dev), a=0.5, b=0.5) + nPOUK.ob_norm_local(addr, object_array, object_norm) + nPOUK.pr_update_local(addr, probe, object_array, exit_wave, auxiliary_wave, object_norm, object_norm.max(), a=0.5, b=0.5) + + np.testing.assert_allclose(probe_dev.get(), probe, rtol=1e-6, atol=1e-6, + err_msg="The probe has not been updated as expected") + + def test_ob_norm_local_UNITY(self): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. + G = 2 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + scan_pts = 1 # one dimensional scan point number + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + object_array = np.empty(shape=(G, H, I), dtype=COMPLEX_TYPE) + for idx in range(G): + object_array[idx] = np.ones((H, I)) * (3 * idx + 1) + 1j * np.ones((H, I)) * (3 * idx + 1) + object_norm = np.empty(shape=(1,B,C), dtype=FLOAT_TYPE) + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y):# + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + from 
ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel + nPOUK = npPoUpdateKernel() + POUK = PoUpdateKernel(queue_thread=self.stream) + + object_array_dev = cp.asarray(object_array) + object_norm_dev = cp.asarray(object_norm) + addr_dev = cp.asarray(addr) + + POUK.ob_norm_local(addr_dev, object_array_dev, object_norm_dev) + nPOUK.ob_norm_local(addr, object_array, object_norm) + + np.testing.assert_allclose(object_norm_dev.get(), object_norm, rtol=1e-6, atol=1e-6, + err_msg="The object norm has not been updated as expected") + + def test_pr_norm_local_UNITY(self): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + E = B # probe size y + F = C # probe size x + + npts_greater_than = 2 # how many points bigger than the probe the object is. + G = 2 # number of object modes + H = B + npts_greater_than # object size y + I = C + npts_greater_than # object size x + + scan_pts = 1 # one dimensional scan point number + + total_number_scan_positions = scan_pts ** 2 + total_number_modes = G * D + A = total_number_scan_positions * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + probe = np.empty(shape=(D, E, F), dtype=COMPLEX_TYPE) + for idx in range(D): + probe[idx] = np.ones((E, F)) * (idx + 1) + 1j * np.ones((E, F)) * (idx + 1) + probe_norm = np.empty(shape=(1,B,C), dtype=FLOAT_TYPE) + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((total_number_scan_positions)) + Y = Y.reshape((total_number_scan_positions)) + + addr = np.zeros((total_number_scan_positions, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y):# + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [0, 0, 0], + [0, 0, 0]], dtype=INT_TYPE) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + 
''' + from ptypy.accelerate.base.kernels import PoUpdateKernel as npPoUpdateKernel + nPOUK = npPoUpdateKernel() + POUK = PoUpdateKernel() + + probe_dev = cp.asarray(probe) + probe_norm_dev = cp.asarray(probe_norm) + addr_dev = cp.asarray(addr) + + POUK.pr_norm_local(addr_dev, probe_dev, probe_norm_dev) + nPOUK.pr_norm_local(addr, probe, probe_norm) + + np.testing.assert_allclose(probe_norm_dev.get(), probe_norm, rtol=1e-6, atol=1e-6, + err_msg="The probe norm has not been updated as expected") + + +if __name__ == '__main__': + unittest.main() diff --git a/test/accelerate_tests/cuda_cupy_tests/position_correction_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/position_correction_kernel_test.py new file mode 100644 index 000000000..7e817fa60 --- /dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/position_correction_kernel_test.py @@ -0,0 +1,149 @@ +''' + + +''' + +import unittest +import numpy as np +from . import CupyCudaTest, have_cupy +from ptypy import utils as u + +if have_cupy(): + import cupy as cp + from ptypy.accelerate.cuda_cupy.kernels import PositionCorrectionKernel + from ptypy.accelerate.base.kernels import PositionCorrectionKernel as abPositionCorrectionKernel + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + + +class PositionCorrectionKernelTest(CupyCudaTest): + + def setUp(self): + CupyCudaTest.setUp(self) + self.params = u.Param() + self.params.nshifts = 4 + self.params.method = "Annealing" + self.params.amplitude = 2e-9 + self.params.start = 0 + self.params.stop = 10 + self.params.max_shift = 2e-9 + self.params.amplitude_decay = True + self.resolution = [1e-9,1e-9] + + def update_addr_and_error_state_UNITY_helper(self, size, modes): + ## Arrange + addr = np.ones((size, modes, 5, 3), dtype=np.int32) + mangled_addr = 2 * addr + err_state = np.zeros((size,), dtype=np.float32) + err_state[5:] = 2. 
+ err_sum = np.ones((size, ), dtype=np.float32) + addr_gpu = cp.asarray(addr) + mangled_addr_gpu = cp.asarray(mangled_addr) + err_state_gpu = cp.asarray(err_state) + err_sum_gpu = cp.asarray(err_sum) + aux = np.ones((1,1,1), dtype=np.complex64) + + ## Act + PCK = PositionCorrectionKernel(aux, modes, self.params, self.resolution, queue_thread=self.stream) + PCK.update_addr_and_error_state(addr_gpu, err_state_gpu, mangled_addr_gpu, err_sum_gpu) + abPCK = abPositionCorrectionKernel(aux, modes, self.params, self.resolution) + abPCK.update_addr_and_error_state(addr, err_state, mangled_addr, err_sum) + + ## Assert + np.testing.assert_array_equal(addr_gpu.get(), addr) + np.testing.assert_array_equal(err_state_gpu.get(), err_state) + + def test_update_addr_and_error_state_UNITY_small_onemode(self): + self.update_addr_and_error_state_UNITY_helper(4, 1) + + def test_update_addr_and_error_state_UNITY_large_onemode(self): + self.update_addr_and_error_state_UNITY_helper(323, 1) + + def test_update_addr_and_error_state_UNITY_small_multimode(self): + self.update_addr_and_error_state_UNITY_helper(4, 3) + + def test_update_addr_and_error_state_UNITY_large_multimode(self): + self.update_addr_and_error_state_UNITY_helper(323, 3) + + def log_likelihood_ml_UNITY(self): + ''' + setup + ''' + B = 5 # frame size y + C = 5 # frame size x + + D = 2 # number of probe modes + G = 2 # number of object modes + + E = B # probe size y + F = C # probe size x + + scan_pts = 2 # one dimensional scan point number + + N = scan_pts ** 2 + total_number_modes = G * D + A = N * total_number_modes # this is a 16 point scan pattern (4x4 grid) over all the modes + + f = np.empty(shape=(A, B, C), dtype=COMPLEX_TYPE) + for idx in range(A): + f[idx] = np.ones((B, C)) * (idx + 1) + 1j * np.ones((B, C)) * (idx + 1) + + fmag = np.empty(shape=(N, B, C), dtype=FLOAT_TYPE) # the measured magnitudes NxAxB + fmag_fill = np.arange(np.prod(fmag.shape)).reshape(fmag.shape).astype(fmag.dtype) + fmag[:] = fmag_fill + I = 
fmag**2 + + mask = np.empty(shape=(N, B, C), + dtype=FLOAT_TYPE) # the masks for the measured magnitudes either 1xAxB or NxAxB + mask_fill = np.ones_like(mask) + mask_fill[::2, ::2] = 0 # checkerboard for testing + mask[:] = mask_fill + w = mask /(I+1.) + + X, Y = np.meshgrid(range(scan_pts), range(scan_pts)) + X = X.reshape((N,)) + Y = Y.reshape((N,)) + + addr = np.zeros((N, total_number_modes, 5, 3), dtype=INT_TYPE) + + exit_idx = 0 + position_idx = 0 + for xpos, ypos in zip(X, Y): + mode_idx = 0 + for pr_mode in range(D): + for ob_mode in range(G): + addr[position_idx, mode_idx] = np.array([[pr_mode, 0, 0], + [ob_mode, ypos, xpos], + [exit_idx, 0, 0], + [position_idx, 0, 0], + [position_idx, 0, 0]]) + mode_idx += 1 + exit_idx += 1 + position_idx += 1 + + ''' + test + ''' + mask_sum = mask.sum(-1).sum(-1) + LLerr = np.zeros_like(mask_sum, dtype=np.float32) + f_d = cp.asarray(f) + w_d = cp.asarray(w) + I_d = cp.asarray(I) + addr_d = cp.asarray(addr) + LLerr_d = cp.asarray(LLerr) + + ## Act + PCK = PositionCorrectionKernel(f, total_number_modes, self.params, self.resolution, queue_thread=self.stream) + abPCK = abPositionCorrectionKernel(f, total_number_modes, self.params, self.resolution) + abPCK.log_likelihood_ml(f, addr, I, w, LLerr) + PCK.log_likelihood_ml(f_d, addr_d, I_d, w_d, LLerr_d) + + expected_err_phot = LLerr + measured_err_phot = LLerr_d.get() + + np.testing.assert_allclose(expected_err_phot, measured_err_phot, err_msg="Numpy log-likelihood error " + "is \n%s, \nbut gpu log-likelihood error is \n%s, \n " % ( + repr(expected_err_phot), + repr(measured_err_phot)), rtol=1e-5) diff --git a/test/accelerate_tests/cuda_cupy_tests/propagation_kernel_test.py b/test/accelerate_tests/cuda_cupy_tests/propagation_kernel_test.py new file mode 100644 index 000000000..c221d59c0 --- /dev/null +++ b/test/accelerate_tests/cuda_cupy_tests/propagation_kernel_test.py @@ -0,0 +1,157 @@ +''' + +''' + +import numpy as np +import ptypy.utils as u +from . 
import CupyCudaTest, have_cupy + +if have_cupy(): + import cupy as cp + from ptypy.accelerate.cuda_cupy.kernels import PropagationKernel + +from ptypy.core import geometry +from ptypy.core import Base as theBase + +# subclass for dictionary access +Base = type('Base',(theBase,),{}) + +COMPLEX_TYPE = np.complex64 +FLOAT_TYPE = np.float32 +INT_TYPE = np.int32 + +class PropagationKernelTest(CupyCudaTest): + + def set_up_farfield(self,shape, resolution=None): + P = Base() + P.CType = COMPLEX_TYPE + P.Ftype = FLOAT_TYPE + g = u.Param() + g.energy = None # u.keV2m(1.0)/6.32e-7 + g.lam = 5.32e-7 + g.distance = 15e-2 + g.psize = 24e-6 + g.shape = shape + g.propagation = "farfield" + if resolution is not None: + g.resolution = resolution + G = geometry.Geo(owner=P, pars=g) + return G + + def set_up_nearfield(self, shape): + P = Base() + P.CType = COMPLEX_TYPE + P.Ftype = FLOAT_TYPE + g = u.Param() + g.energy = None # u.keV2m(1.0)/6.32e-7 + g.lam = 1e-10 + g.distance = 1.0 + g.psize = 100e-9 + g.shape = shape + g.propagation = "nearfield" + G = geometry.Geo(owner=P, pars=g) + return G + + def test_farfield_propagator_forward_UNITY(self): + # setup + SH = (2,16,16) + aux = np.zeros((SH), dtype=COMPLEX_TYPE) + aux[:,5:11,5:11] = 1. + 2j + aux_d = cp.asarray(aux) + geo = self.set_up_farfield(SH[1:]) + + # test + aux = geo.propagator.fw(aux) + PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream) + PropK.allocate() + PropK.fw(aux_d, aux_d) + + np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, + err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) + + def test_farfield_propagator_backward_UNITY(self): + # setup + SH = (2,16,16) + aux = np.zeros((SH), dtype=COMPLEX_TYPE) + aux[:,5:11,5:11] = 1. 
+ 2j + aux_d = cp.asarray(aux) + geo = self.set_up_farfield(SH[1:]) + + # test + aux = geo.propagator.bw(aux) + PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream) + PropK.allocate() + PropK.bw(aux_d, aux_d) + + np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, + err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) + + def test_farfield_propagator_forward_crop_pad_UNITY(self): + # setup + SH = (2,16,16) + aux = np.zeros((SH), dtype=COMPLEX_TYPE) + aux[:,5:11,5:11] = 1. + 2j + aux_d = cp.asarray(aux) + geo = self.set_up_farfield(SH[1:]) + geo = self.set_up_farfield(SH[1:], resolution=0.5*geo.resolution) + + # test + aux = geo.propagator.fw(aux) + PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream) + PropK.allocate() + PropK.fw(aux_d, aux_d) + + np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, + err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) + + def test_farfield_propagator_backward_crop_pad_UNITY(self): + # setup + SH = (2,16,16) + aux = np.zeros((SH), dtype=COMPLEX_TYPE) + aux[:,5:11,5:11] = 1. + 2j + aux_d = cp.asarray(aux) + geo = self.set_up_farfield(SH[1:]) + geo = self.set_up_farfield(SH[1:], resolution=0.5*geo.resolution) + + # test + aux = geo.propagator.bw(aux) + PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream) + PropK.allocate() + PropK.bw(aux_d, aux_d) + + np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, + err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) + + def test_nearfield_propagator_forward_UNITY(self): + # setup + SH = (2,16,16) + aux = np.zeros((SH), dtype=COMPLEX_TYPE) + aux[:,5:11,5:11] = 1. 
+ 2j + aux_d = cp.asarray(aux) + geo = self.set_up_nearfield(SH[1:]) + + # test + aux = geo.propagator.fw(aux) + PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream) + PropK.allocate() + PropK.fw(aux_d, aux_d) + + np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, + err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) + + def test_nearfield_propagator_backward_UNITY(self): + # setup + SH = (2,16,16) + aux = np.zeros((SH), dtype=COMPLEX_TYPE) + aux[:,5:11,5:11] = 1. + 2j + aux_d = cp.asarray(aux) + geo = self.set_up_nearfield(SH[1:]) + + # test + aux = geo.propagator.bw(aux) + PropK = PropagationKernel(aux_d, geo.propagator, queue_thread=self.stream) + PropK.allocate() + PropK.bw(aux_d, aux_d) + + np.testing.assert_allclose(aux_d.get(), aux, atol=1e-06, rtol=5e-5, + err_msg="Numpy aux is \n%s, \nbut gpu aux is \n %s, \n " % (repr(aux), repr(aux_d.get()))) \ No newline at end of file diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_setstream_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_setstream_test.py index 1220702b7..5816e3bf3 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_setstream_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_setstream_test.py @@ -25,6 +25,7 @@ def helper(self, FFT): t2 = time.time() dur1 = t2 - t1 f_dev = gpuarray.to_gpu(f) + self.stream.synchronize() # measure with events to make sure that something actually # happened in the right stream From df073704eb5e6bdb3b609bbd1623ca83339b2ea2 Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Tue, 24 Jan 2023 16:42:27 +0000 Subject: [PATCH 03/37] bump version to 0.8 --- ptypy/version.py | 6 +++--- release_notes.md | 11 +++++++++++ 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/ptypy/version.py b/ptypy/version.py index 272c25708..ee3909a1b 100644 --- a/ptypy/version.py +++ b/ptypy/version.py @@ -1,7 +1,7 @@ -short_version = '0.7.0' -version = '0.7.0' -release = True 
+short_version = '0.8.0' +version = '0.8.0' +release = False if not release: version += '.dev' diff --git a/release_notes.md b/release_notes.md index 61ea173d2..e2fd647fc 100644 --- a/release_notes.md +++ b/release_notes.md @@ -1,3 +1,14 @@ +# PtyPy 0.8 release notes + +An alternative CUDA implementation based on [`cupy`](https://cupy.dev/) +has been implemented, providing the same features as the `PyCuda`-based +engine. +It can be imported using +```python +import ptypy +ptypy.load_gpu_engines('cupy') +``` + # PtyPy 0.7 release notes This release is focused on improving the usability of PtyPy in Jupyter notebooks in preparation for the From 1bb839745cfb210cfab9e07ba565be713aa883b5 Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Wed, 15 Feb 2023 14:50:21 +0000 Subject: [PATCH 04/37] non-threaded autoplotting (Jupyter) should only be on when autoplot is active (#480) --- ptypy/core/ptycho.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ptypy/core/ptycho.py b/ptypy/core/ptycho.py index 36ed97a28..926aad919 100644 --- a/ptypy/core/ptycho.py +++ b/ptypy/core/ptycho.py @@ -703,7 +703,7 @@ def run(self, label=None, epars=None, engine=None): 'Exit %.2e' % tuple(err)) imsg = '%(engine)s: Iteration # %(iteration)d/%(numiter)d :: ' %info + \ 'Fourier %.2e, Photons %.2e, Exit %.2e' %tuple(err) - if not self.p.io.autoplot.threaded: + if (self.p.io.autoplot.active) and (not self.p.io.autoplot.threaded): if not (info["iteration"] % self.p.io.autoplot.interval): if self._jupyter_client is None: from IPython import display From 1c9d832f807dd67d1d0ac0ff46688b9454902559 Mon Sep 17 00:00:00 2001 From: "Benedikt J.
Daurer" Date: Wed, 15 Feb 2023 14:51:23 +0000 Subject: [PATCH 05/37] SimSacn: reset diff storage to zeros (#479) --- ptypy/simulations/simscan.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ptypy/simulations/simscan.py b/ptypy/simulations/simscan.py index 1785ff7b9..a8444f8e4 100644 --- a/ptypy/simulations/simscan.py +++ b/ptypy/simulations/simscan.py @@ -164,6 +164,10 @@ def __init__(self, pars=None, **kwargs): P = self.manipulate_ptycho(P) ############################################################# + # Make sure all diff storages are empty + for name, storage in P.diff.S.items(): + storage.data.fill(0.) + # Simulate diffraction signal logger.info('Propagating exit waves.') for name,pod in P.pods.items(): From 4d46f5f0a18cc961599479922a19605a17d7e057 Mon Sep 17 00:00:00 2001 From: jsouter <107045742+jsouter@users.noreply.github.com> Date: Wed, 22 Feb 2023 08:55:00 +0000 Subject: [PATCH 06/37] Load data during creation with SwmrLoader class (#428) * Added SwmrLoader and introduced new logic for live processing --------- Co-authored-by: Benedikt Daurer --- ptypy/core/manager.py | 4 + ptypy/core/ptycho.py | 20 ++++- ptypy/engines/base.py | 1 + ptypy/experiment/hdf5_loader.py | 139 +++++++++++--------------------- ptypy/experiment/swmr_loader.py | 125 ++++++++++++++++++++++++++++ 5 files changed, 197 insertions(+), 92 deletions(-) create mode 100644 ptypy/experiment/swmr_loader.py diff --git a/ptypy/core/manager.py b/ptypy/core/manager.py index 3bb7278ef..0d9d7d341 100644 --- a/ptypy/core/manager.py +++ b/ptypy/core/manager.py @@ -1645,6 +1645,10 @@ def _from_dict(cls, dct): @property def data_available(self): return any(s.data_available for s in list(self.scans.values())) + + @property + def end_of_scan(self): + return all(s.ptyscan.end_of_scan for s in list(self.scans.values())) def new_data(self): """ diff --git a/ptypy/core/ptycho.py b/ptypy/core/ptycho.py index 926aad919..eab95b33f 100644 --- a/ptypy/core/ptycho.py +++ b/ptypy/core/ptycho.py @@ -109,6 
+109,12 @@ class Ptycho(Base): lowlim = 1 userlevel = 1 + [min_frames_for_recon] + default = 0 + type = int + help = Minimum number of frames to be loaded before reconstruction can start. + doc = For on-the-fly (live) processing, the first reconstruction engine will wait until this many frames have been loaded. + [dry_run] default = False help = Dry run switch @@ -524,8 +530,9 @@ def init_data(self, print_stats=True): # Load the data. This call creates automatically the scan managers, # which create the views and the PODs. Sets self.new_data with LogTime(self.p.io.benchmark == 'all') as t: - self.new_data = self.model.new_data() - if (self.p.io.benchmark == 'all') and parallel.master: self.benchmark.data_load += t.duration + while not self.new_data: + self.new_data = self.model.new_data() + if (self.p.io.benchmark == 'all') and parallel.master: self.benchmark.data_load += t.duration # Print stats parallel.barrier() @@ -677,6 +684,10 @@ def run(self, label=None, epars=None, engine=None): engine.prepare() if (self.p.io.benchmark == 'all') and parallel.master: self.benchmark.engine_prepare += t.duration + # Keep loading data, unless we have reached minimum nr. of frames or end of scan + if (len(self.diff.V) < self.p.min_frames_for_recon) and not self.model.end_of_scan: + continue + auto_save = self.p.io.autosave if auto_save.active and auto_save.interval > 0: if engine.curiter % auto_save.interval == 0: @@ -686,6 +697,11 @@ def run(self, label=None, epars=None, engine=None): self.runtime.last_save = engine.curiter logger.info(headerline()) + # If not end of scan, expand total number of iterations + # This is to make sure that the specified nr. 
of iterations is guaranteed once all data is loaded + if not self.model.end_of_scan: + engine.numiter += engine.p.numiter_contiguous + # One iteration with LogTime(self.p.io.benchmark == 'all') as t: engine.iterate() diff --git a/ptypy/engines/base.py b/ptypy/engines/base.py index 78f1f6e04..a7ffb6cee 100644 --- a/ptypy/engines/base.py +++ b/ptypy/engines/base.py @@ -42,6 +42,7 @@ class BaseEngine(object): type = int lowlim = 1 help = Total number of iterations + doc = For on-the-fly (live) processing, the reconstruction engine will iterate at least this many times after all data has been loaded. [numiter_contiguous] default = 1 diff --git a/ptypy/experiment/hdf5_loader.py b/ptypy/experiment/hdf5_loader.py index 2459069c9..80ecc2b87 100644 --- a/ptypy/experiment/hdf5_loader.py +++ b/ptypy/experiment/hdf5_loader.py @@ -42,19 +42,6 @@ class Hdf5Loader(PtyScan): It is assumed in this latter case that the fast axis in the scan corresponds the fast axis on disc (i.e. C-ordered layout). - [intensities.is_swmr] - default = False - type = bool - help = If True, then intensities are assumed to be a swmr dataset that is being written as processing - is taking place. - - [intensities.live_key] - default = None - type = str - help = Key to live keys inside the intensities.file (used only if is_swmr is True) - doc = Live_keys indicate where the data collection has progressed to. They are zero at the - scan start, but non-zero when the position is complete. - [intensities.file] default = None type = str @@ -79,19 +66,6 @@ class Hdf5Loader(PtyScan): * axis_data.shape (C,) for data.shape (C*D, frame_size_m, frame_size_n) where D is the size of the other axis. - [positions.is_swmr] - default = False - type = bool - help = If True, positions are assumed to be a swmr dataset that is being written as processing - is taking place. - - [positions.live_key] - default = None - type = str - help = Live_keys indicate where the data collection has progressed to. 
They are zero at the - scan start, but non-zero when the position is complete. If None whilst positions.is_swmr - is True, use "intensities.live_key". - [positions.file] default = None type = str @@ -199,20 +173,6 @@ class Hdf5Loader(PtyScan): help = Parameters for per-point normalisation (i.e. ion chamber reading). doc = The shape of loaded data is assumed to have the same dimensionality as data.shape[:-2] - [normalisation.is_swmr] - default = False - type = bool - help = If this is set to be true, then normalisations are assumed to be swmr datasets that are being written as processing - is taking place. - - [normalisation.live_key] - default = None - type = str - help = If normalisation.is_swmr is true then we need a live_key to know where the data collection has progressed to. - This is the key to these live keys inside the normalisation.file. If None, whilst normalisation.is_swmr is - True, then we just assume the same keys work for both normalisation and intensities. They are zero at the - scan start, but non-zero when the position is complete. - [normalisation.file] default = None type = str @@ -340,7 +300,7 @@ class Hdf5Loader(PtyScan): and converted to electron wavelengths. 
""" - def __init__(self, pars=None, **kwargs): + def __init__(self, pars=None, swmr=False, **kwargs): """ hdf5 data loader """ @@ -369,8 +329,11 @@ def __init__(self, pars=None, **kwargs): self.preview_indices = None self.framefilter = None self._is_spectro_scan = False - + self._is_swmr = swmr + self.fhandle_intensities = None + self.fhandle_positions_fast = None + self.fhandle_positions_slow = None self.fhandle_darkfield = None self.fhandle_flatfield = None self.fhandle_normalisation = None @@ -417,17 +380,12 @@ def _params_check(self): self.p.positions.fast_key]: raise RuntimeError("Missing some information about either the positions or the intensity mapping!") - if True in [self.p.intensities.is_swmr, - self.p.positions.is_swmr, - self.p.normalisation.is_swmr]: - raise NotImplementedError("Currently swmr functionality is not implemented! Coming soon...") - def _spectro_scan_check(self): """ make adjustments if dealing with a spectro scan """ if None not in [self.p.recorded_energy.file, self.p.recorded_energy.key]: - with h5.File(self.p.recorded_energy.file, 'r') as f: + with h5.File(self.p.recorded_energy.file, 'r', swmr=self._is_swmr) as f: _energy_dset = f[self.p.recorded_energy.key] if len(_energy_dset.shape): if _energy_dset.shape[0] > 1: @@ -436,32 +394,32 @@ def _spectro_scan_check(self): self.p.outer_index = 0 if self._is_spectro_scan: log(3, "This is appears to be a spectro scan, selecting index = {}".format(self.p.outer_index)) + if self._is_spectro_scan and self._is_swmr: + raise RuntimeError("Spectro scans are currently not compatible with SWMR mode") def _prepare_intensity_and_positions(self): """ Prep for loading intensity and position data """ - self.fhandle_intensities = h5.File(self.p.intensities.file, 'r') + self.fhandle_intensities = h5.File(self.p.intensities.file, 'r', swmr=self._is_swmr) self.intensities = self.fhandle_intensities[self.p.intensities.key] self.intensities_dtype = self.intensities.dtype self.data_shape = 
self.intensities.shape if self._is_spectro_scan and self.p.outer_index is not None: self.data_shape = tuple(np.array(self.data_shape)[1:]) - with h5.File(self.p.positions.file, 'r') as f: - fast_axis = f[self.p.positions.fast_key][...] + self.fhandle_positions_fast = h5.File(self.p.positions.file, 'r', swmr=self._is_swmr) + self.fast_axis = self.fhandle_positions_fast[self.p.positions.fast_key] if self._is_spectro_scan and self.p.outer_index is not None: - fast_axis = fast_axis[self.p.outer_index] - self.fast_axis = np.squeeze(fast_axis) if fast_axis.ndim > 2 else fast_axis - self.positions_fast_shape = self.fast_axis.shape + self.fast_axis = self.fast_axis[self.p.outer_index] + self.positions_fast_shape = np.squeeze(self.fast_axis).shape if self.fast_axis.ndim > 2 else self.fast_axis.shape - with h5.File(self.p.positions.file, 'r') as f: - slow_axis = f[self.p.positions.slow_key][...] + self.fhandle_positions_slow = h5.File(self.p.positions.file, 'r', swmr=self._is_swmr) + self.slow_axis = self.fhandle_positions_slow[self.p.positions.slow_key] if self._is_spectro_scan and self.p.outer_index is not None: - slow_axis = slow_axis[self.p.outer_index] - self.slow_axis = np.squeeze(slow_axis) if slow_axis.ndim > 2 else slow_axis - self.positions_slow_shape = self.slow_axis.shape + self.slow_axis = self.slow_axis[self.p.outer_index] + self.positions_slow_shape = np.squeeze(self.slow_axis).shape if self.slow_axis.ndim > 2 else self.slow_axis.shape log(3, "The shape of the \n\tdiffraction intensities is: {}\n\tslow axis data:{}\n\tfast axis data:{}".format(self.data_shape, self.positions_slow_shape, @@ -475,7 +433,7 @@ def _prepare_framefilter(self): Prep for framefilter """ if None not in [self.p.framefilter.file, self.p.framefilter.key]: - with h5.File(self.p.framefilter.file, 'r') as f: + with h5.File(self.p.framefilter.file, 'r', swmr=self._is_swmr) as f: self.framefilter = f[self.p.framefilter.key][()].squeeze() > 0 # turn into boolean if self._is_spectro_scan and 
self.p.outer_index is not None: self.framefilter = self.framefilter[self.p.outer_index] @@ -493,7 +451,7 @@ def _prepare_darkfield(self): Prep for darkfield """ if None not in [self.p.darkfield.file, self.p.darkfield.key]: - self.fhandle_darkfield = h5.File(self.p.darkfield.file, 'r') + self.fhandle_darkfield = h5.File(self.p.darkfield.file, 'r', swmr=self._is_swmr) self.darkfield = self.fhandle_darkfield[self.p.darkfield.key] log(3, "The darkfield has shape: {}".format(self.darkfield.shape)) if self.darkfield.shape == self.data_shape: @@ -516,7 +474,7 @@ def _prepare_flatfield(self): Prep for flatfield """ if None not in [self.p.flatfield.file, self.p.flatfield.key]: - self.fhandle_flatfield = h5.File(self.p.flatfield.file, 'r') + self.fhandle_flatfield = h5.File(self.p.flatfield.file, 'r', swmr=self._is_swmr) self.flatfield = self.fhandle_flatfield[self.p.flatfield.key] log(3, "The flatfield has shape: {}".format(self.flatfield.shape)) if self.flatfield.shape == self.data_shape: @@ -535,7 +493,7 @@ def _prepare_mask(self): Prep for mask """ if None not in [self.p.mask.file, self.p.mask.key]: - self.fhandle_mask = h5.File(self.p.mask.file, 'r') + self.fhandle_mask = h5.File(self.p.mask.file, 'r', swmr=self._is_swmr) self.mask = self.fhandle_mask[self.p.mask.key] self.mask_dtype = self.mask.dtype log(3, "The mask has shape: {}".format(self.mask.shape)) @@ -557,7 +515,7 @@ def _prepare_normalisation(self): Prep for normalisation """ if None not in [self.p.normalisation.file, self.p.normalisation.key]: - self.fhandle_normalisation = h5.File(self.p.normalisation.file, 'r') + self.fhandle_normalisation = h5.File(self.p.normalisation.file, 'r', swmr=self._is_swmr) self.normalisation = self.fhandle_normalisation[self.p.normalisation.key] self.normalisation_mean = self.normalisation[:].mean() self.normalisation_std = self.normalisation[:].std() @@ -577,7 +535,7 @@ def _prepare_meta_info(self): Prep for meta info (energy, distance, psize) """ if None not in 
[self.p.recorded_energy.file, self.p.recorded_energy.key]: - with h5.File(self.p.recorded_energy.file, 'r') as f: + with h5.File(self.p.recorded_energy.file, 'r', swmr=self._is_swmr) as f: if self._is_spectro_scan and self.p.outer_index is not None: self.p.energy = float(f[self.p.recorded_energy.key][self.p.outer_index]) else: @@ -587,13 +545,13 @@ def _prepare_meta_info(self): log(3, "loading energy={} from file".format(self.p.energy)) if None not in [self.p.recorded_distance.file, self.p.recorded_distance.key]: - with h5.File(self.p.recorded_distance.file, 'r') as f: + with h5.File(self.p.recorded_distance.file, 'r', swmr=self._is_swmr) as f: self.p.distance = float(f[self.p.recorded_distance.key][()] * self.p.recorded_distance.multiplier) self.meta.distance = self.p.distance log(3, "loading distance={} from file".format(self.p.distance)) if None not in [self.p.recorded_psize.file, self.p.recorded_psize.key]: - with h5.File(self.p.recorded_psize.file, 'r') as f: + with h5.File(self.p.recorded_psize.file, 'r', swmr=self._is_swmr) as f: self.p.psize = float(f[self.p.recorded_psize.key][()] * self.p.recorded_psize.multiplier) self.info.psize = self.p.psize log(3, "loading psize={} from file".format(self.p.psize)) @@ -642,13 +600,12 @@ def load_unmapped_raster_scan(self, indices): intensities = {} positions = {} weights = {} - sh = self.slow_axis.shape for ii in indices: slow_idx, fast_idx = self.preview_indices[:, ii] - intensity_index = slow_idx * sh[1] + fast_idx + intensity_index = slow_idx * self.slow_axis.shape[1] + fast_idx weights[ii], intensities[ii] = self.get_corrected_intensities(intensity_index) - positions[ii] = np.array([self.slow_axis[slow_idx, fast_idx] * self.p.positions.slow_multiplier, - self.fast_axis[slow_idx, fast_idx] * self.p.positions.fast_multiplier]) + positions[ii] = np.array([np.squeeze(self.slow_axis[slow_idx, fast_idx]) * self.p.positions.slow_multiplier, + np.squeeze(self.fast_axis[slow_idx, fast_idx]) * 
self.p.positions.fast_multiplier]) log(3, 'Data loaded successfully.') return intensities, positions, weights @@ -658,9 +615,9 @@ def load_mapped_and_raster_scan(self, indices): weights = {} for jj in indices: slow_idx, fast_idx = self.preview_indices[:, jj] - weights[jj], intensities[jj] = self.get_corrected_intensities((slow_idx, fast_idx)) # or the other way round??? - positions[jj] = np.array([self.slow_axis[slow_idx, fast_idx] * self.p.positions.slow_multiplier, - self.fast_axis[slow_idx, fast_idx] * self.p.positions.fast_multiplier]) + weights[jj], intensities[jj] = self.get_corrected_intensities((slow_idx, fast_idx)) + positions[jj] = np.array([np.squeeze(self.slow_axis[slow_idx, fast_idx]) * self.p.positions.slow_multiplier, + np.squeeze(self.fast_axis[slow_idx, fast_idx]) * self.p.positions.fast_multiplier]) log(3, 'Data loaded successfully.') return intensities, positions, weights @@ -671,9 +628,8 @@ def load_mapped_and_arbitrary_scan(self, indices): for ii in indices: jj = self.preview_indices[ii] weights[ii], intensities[ii] = self.get_corrected_intensities(jj) - positions[ii] = np.array([self.slow_axis[jj] * self.p.positions.slow_multiplier, - self.fast_axis[jj] * self.p.positions.fast_multiplier]) - + positions[ii] = np.array([np.squeeze(self.slow_axis[jj]) * self.p.positions.slow_multiplier, + np.squeeze(self.fast_axis[jj]) * self.p.positions.fast_multiplier]) log(3, 'Data loaded successfully.') return intensities, positions, weights @@ -752,8 +708,8 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape, log(3, "Everything is wonderful, each diffraction point has a co-ordinate.") self._ismapped = True - slow_axis_bounds = [0, self.slow_axis.shape[0]] - fast_axis_bounds = [0, self.fast_axis.shape[-1]] + slow_axis_bounds = [0, self.positions_slow_shape[0]] + fast_axis_bounds = [0, self.positions_fast_shape[-1]] set_slow_axis_bounds = self.p.positions.bounding_box.slow_axis_bounds set_fast_axis_bounds = 
self.p.positions.bounding_box.fast_axis_bounds @@ -777,6 +733,7 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape, if self.framefilter is not None: self.preview_indices = self.preview_indices[:,self.framefilter[indices[1][::skip,::skip], indices[0][::skip,::skip]].flatten()] self.num_frames = len(self.preview_indices[0]) + else: if (set_slow_axis_bounds is not None) and (set_fast_axis_bounds is not None): log(3, "Setting slow axis bounds for an arbitrary mapped scan doesn't make sense. " @@ -799,8 +756,8 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape, axis_data.shape (C, D) for data.shape (C*D, frame_size_m, frame_size_n) , ''' log(3, "Positions are raster, but data is a list of frames. Unpacking the data to match the positions...") - slow_axis_bounds = [0, self.slow_axis.shape[0]] - fast_axis_bounds = [0, self.fast_axis.shape[-1]] + slow_axis_bounds = [0, self.positions_slow_shape[0]] + fast_axis_bounds = [0, self.positions_fast_shape[-1]] set_slow_axis_bounds = self.p.positions.bounding_box.slow_axis_bounds set_fast_axis_bounds = self.p.positions.bounding_box.fast_axis_bounds @@ -830,8 +787,8 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape, axis_data.shape (C,) for data.shape (C, D, frame_size_m, frame_size_n) where D is the size of the other axis, ''' log(3, "Assuming the axes are 1D and need to be meshed to match the raster style data") - slow_axis_bounds = [0, self.slow_axis.shape[0]] - fast_axis_bounds = [0, self.fast_axis.shape[0]] + slow_axis_bounds = [0, self.positions_slow_shape[0]] + fast_axis_bounds = [0, self.positions_fast_shape[0]] set_slow_axis_bounds = self.p.positions.bounding_box.slow_axis_bounds set_fast_axis_bounds = self.p.positions.bounding_box.fast_axis_bounds @@ -861,8 +818,8 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape, cases covered: axis_data.shape (C,) for data.shape (C*D, frame_size_m, frame_size_n) 
where D is the size of the other axis. ''' - slow_axis_bounds = [0,self.slow_axis.shape[0]] - fast_axis_bounds = [0, self.fast_axis.shape[0]] + slow_axis_bounds = [0,self.positions_slow_shape[0]] + fast_axis_bounds = [0, self.positions_fast_shape[0]] set_slow_axis_bounds = self.p.positions.bounding_box.slow_axis_bounds set_fast_axis_bounds = self.p.positions.bounding_box.fast_axis_bounds @@ -890,7 +847,7 @@ def compute_scan_mapping_and_trajectory(self, data_shape, positions_fast_shape, else: raise IOError("I don't know what to do with these positions/data shapes") else: - raise IOError("I don't know what to do with these positions/data shapes") + raise IOError(f"I don't know what to do with these positions/data shapes: {data_shape}, {positions_slow_shape}, {positions_fast_shape}") def _finalize(self): """ @@ -898,10 +855,12 @@ def _finalize(self): """ super()._finalize() for h in [self.fhandle_intensities, - self.fhandle_darkfield, - self.fhandle_flatfield, - self.fhandle_normalisation, - self.fhandle_mask]: + self.fhandle_positions_fast, + self.fhandle_positions_slow, + self.fhandle_darkfield, + self.fhandle_flatfield, + self.fhandle_normalisation, + self.fhandle_mask]: try: h.close() except: diff --git a/ptypy/experiment/swmr_loader.py b/ptypy/experiment/swmr_loader.py new file mode 100644 index 000000000..e82417ecc --- /dev/null +++ b/ptypy/experiment/swmr_loader.py @@ -0,0 +1,125 @@ +# -*- coding: utf-8 -*- +"""\ +Scan loading recipe for the Diamond beamlines. + +This file is part of the PTYPY package. + + :copyright: Copyright 2014 by the PTYPY team, see AUTHORS. + :license: see LICENSE for details. 
+""" +import h5py as h5 + +from ptypy.experiment import register +from ptypy.experiment.hdf5_loader import Hdf5Loader +from ptypy.utils.verbose import log + +try: + from swmr_tools import KeyFollower + +except ImportError: + log(3, "The SWMR loader requires swmr_tools to be installed," + " try pip install swmr_tools") + raise ImportError + + +@register() +class SwmrLoader(Hdf5Loader): + """ + This is an attempt to load data from a live SWMR file that is still being written to. + + Defaults: + + [name] + default = 'SwmrLoader' + type = str + help = + + [intensities.live_key] + default = None + type = str + help = Key to live keys inside the intensities file + doc = Live_keys indicate where the data collection has progressed to. + They are zero at the scan start, but non-zero when the position + is complete. + + [positions.live_fast_key] + default = None + type = str + help = Key to live key for fast axis inside the positions file + doc = Live_keys indicate where the data collection has progressed to. + They are zero at the scan start, but non-zero when the position + is complete. + + [positions.live_slow_key] + default = None + type = str + help = Key to live key for slow axis inside the positions file + doc = Live_keys indicate where the data collection has progressed to. + They are zero at the scan start, but non-zero when the position + is complete. 
+ + """ + def __init__(self, *args, **kwargs): + super().__init__(*args, swmr=True, **kwargs) + + def _params_check(self): + super()._params_check() + + # Check if we have been given the live keys + if None in [self.p.intensities.live_key, + self.p.positions.live_slow_key, + self.p.positions.live_fast_key]: + raise RuntimeError("Missing live keys to intensities or positions") + + # Check that intensities and positions (and their live keys) + # are loaded from the same file + if self.p.intensities.file != self.p.positions.file: + raise RuntimeError("Intensities and positions file should be same") + + def _prepare_intensity_and_positions(self): + super()._prepare_intensity_and_positions() + self.kf = KeyFollower((self.fhandle_intensities[self.p.intensities.live_key], + self.fhandle_positions_slow[self.p.positions.live_slow_key], + self.fhandle_positions_fast[self.p.positions.live_fast_key]), + timeout=5) + + def compute_scan_mapping_and_trajectory(self,*args): + super().compute_scan_mapping_and_trajectory(*args) + assert isinstance(self.slow_axis, h5.Dataset), "Scantype = {:s} and mapped={:} is not compatible with the SwmrLoader".format(self._scantype, self._ismapped) + + def get_data_chunk(self, *args, **kwargs): + self.kf.refresh() + self.intensities.refresh() + self.slow_axis.refresh() + self.fast_axis.refresh() + # refreshing here to update before Ptyscan.get_data_chunk calls check and load + return super().get_data_chunk(*args, **kwargs) + + def check(self, frames=None, start=None): + """ + Check the live SWMR file for available frames. + """ + if start is None: + start = self.framestart + + if frames is None: + frames = self.min_frames + + available = min(self.kf.get_current_max() + 1, self.num_frames) + new_frames = available - start + # not reached expected nr. 
of frames + if new_frames <= frames: + # but its last chunk of scan so load it anyway + if available == self.num_frames: + frames_accessible = new_frames + end_of_scan = 1 + # otherwise, do nothing + else: + end_of_scan = 0 + frames_accessible = 0 + # reached expected nr. of frames + else: + frames_accessible = frames + end_of_scan = 0 + + return frames_accessible, end_of_scan From 20d02ea9baea5d2699fcf3fb387fd7c9202e6e36 Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Wed, 1 Mar 2023 13:54:54 +0000 Subject: [PATCH 07/37] interactive plotting: move Ipython dependency into jupyter client (#482) * move Ipython dependency into jupyter client * added explanation to local ipython import --- ptypy/core/ptycho.py | 3 +-- ptypy/utils/plot_client.py | 10 ++++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/ptypy/core/ptycho.py b/ptypy/core/ptycho.py index eab95b33f..f4034e23e 100644 --- a/ptypy/core/ptycho.py +++ b/ptypy/core/ptycho.py @@ -722,11 +722,10 @@ def run(self, label=None, epars=None, engine=None): if (self.p.io.autoplot.active) and (not self.p.io.autoplot.threaded): if not (info["iteration"] % self.p.io.autoplot.interval): if self._jupyter_client is None: - from IPython import display from ptypy.utils.plot_client import _JupyterClient self._jupyter_client = _JupyterClient(self, autoplot_pars=self.p.io.autoplot, layout_pars=self.p.io.autoplot.layout) self._jupyter_client.runtime.update(self.runtime) - display.display(self._jupyter_client.plot(title=imsg), clear=True) + self._jupyter_client.display(imsg) else: ilog_streamer(imsg) diff --git a/ptypy/utils/plot_client.py b/ptypy/utils/plot_client.py index 6f0eb1bc3..e458f6531 100644 --- a/ptypy/utils/plot_client.py +++ b/ptypy/utils/plot_client.py @@ -716,6 +716,12 @@ def __init__(self, ptycho, autoplot_pars=None, layout_pars=None): in_thread=False) self.initialized = False + # not ideal but currently best solution + # avoiding a module-level import of Ipython + # since its not part of 
the core dependencies + import IPython + self.ipython = IPython + def plot(self, title=""): if not self.initialized: self.update_plot_layout() @@ -725,6 +731,10 @@ def plot(self, title=""): plt.close(self.plot_fig) return self.plot_fig + def display(self,title): + self.ipython.display.display(self.plot(title=title), clear=True) + + def figure_from_ptycho(P, pars=None): """ Returns a matplotlib figure displaying a reconstruction From 6ecaaa3bd3ed0325012b717ac3b4909fd234d216 Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Thu, 22 Jun 2023 16:29:43 +0100 Subject: [PATCH 08/37] Python 3.11 compatibility (#489) * add python 3.11 and remove MPI from matrix * replace deprecated np.bool * more replacmenets for deprecated np.bool and np.float * more replacements for deprecated np.int * drop dependency on specific Python version * unit test cases should return None * use raw docstring to avoid invalid escape sequence --- .github/workflows/test.yml | 11 +++-------- cufft/dependencies.yml | 2 +- dependencies_core.yml | 2 +- dependencies_dev.yml | 2 +- dependencies_full.yml | 2 +- ptypy/accelerate/base/kernels.py | 2 +- ptypy/accelerate/cuda_cupy/dependencies.yml | 2 +- ptypy/accelerate/cuda_pycuda/dependencies.yml | 2 +- ptypy/accelerate/ocl_pyopencl/npy_kernels.py | 2 +- .../accelerate/ocl_pyopencl/npy_kernels_for_block.py | 2 +- ptypy/accelerate/ocl_pyopencl/ocl_fft.py | 2 +- ...ocl_kernels_self_contained_for_future_reference.py | 2 +- ptypy/core/sample.py | 2 +- ptypy/core/xy.py | 2 +- ptypy/custom/ePIE_parallel.py | 2 +- ptypy/engines/Bragg3d_engines.py | 2 +- ptypy/experiment/cSAXS.py | 2 +- ptypy/experiment/optiklabor.py | 2 +- ptypy/simulations/detector.py | 8 ++++---- ptypy/utils/misc.py | 2 +- ptypy/utils/scripts.py | 4 ++-- test/template_tests/prep_and_run_moonflower_test.py | 5 ----- 22 files changed, 27 insertions(+), 37 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index b95689973..4ec05b6b6 100644 --- 
a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,16 +24,11 @@ jobs: max-parallel: 10 fail-fast: false matrix: - python-version: ['3.7','3.8','3.9','3.10'] - mpi: ['mpich', 'openmpi'] - name: Testing with ${{ matrix.mpi }} and Python ${{ matrix.python-version }} + python-version: ['3.8','3.9','3.10', '3.11'] + name: Testing with Python ${{ matrix.python-version }} steps: - name: Checkout uses: actions/checkout@v3 - - name: Set up MPI - uses: mpi4py/setup-mpi@v1 - with: - mpi: ${{ matrix.mpi }} - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: @@ -47,7 +42,7 @@ jobs: - name: Install dependencies run: | # replace python version in core dependencies - sed -i 's/python=3.9/python=${{ matrix.python-version }}/' dependencies_core.yml + sed -i 's/python/python=${{ matrix.python-version }}/' dependencies_core.yml conda env update --file dependencies_core.yml --name base conda list - name: Prepare ptypy diff --git a/cufft/dependencies.yml b/cufft/dependencies.yml index 949079d36..48f17a1e7 100644 --- a/cufft/dependencies.yml +++ b/cufft/dependencies.yml @@ -2,7 +2,7 @@ name: ptypy_cufft channels: - conda-forge dependencies: - - python=3.9 + - python - cmake>=3.8.0 - pybind11 - compilers diff --git a/dependencies_core.yml b/dependencies_core.yml index c31949627..5f0b7c13f 100644 --- a/dependencies_core.yml +++ b/dependencies_core.yml @@ -1,6 +1,6 @@ name: ptypy_core dependencies: - - python=3.9 + - python - numpy - scipy - h5py diff --git a/dependencies_dev.yml b/dependencies_dev.yml index 230a7e190..5462c145a 100644 --- a/dependencies_dev.yml +++ b/dependencies_dev.yml @@ -2,7 +2,7 @@ name: ptypy_full channels: - conda-forge dependencies: - - python=3.9 + - python - numpy - scipy - matplotlib diff --git a/dependencies_full.yml b/dependencies_full.yml index 65a1774f6..e43241fce 100644 --- a/dependencies_full.yml +++ b/dependencies_full.yml @@ -2,7 +2,7 @@ name: ptypy_full channels: - conda-forge dependencies: - - 
python=3.9 + - python - numpy - scipy - matplotlib diff --git a/ptypy/accelerate/base/kernels.py b/ptypy/accelerate/base/kernels.py index f3a13bad5..af1b65b11 100644 --- a/ptypy/accelerate/base/kernels.py +++ b/ptypy/accelerate/base/kernels.py @@ -109,7 +109,7 @@ def error_reduce(self, addr, err_sum): ## Actual math ## # Reduces the Fourier error along the last 2 dimensions.fd - #err_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(np.float) + #err_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(float) err_sum[:] = ferr.sum(-1).sum(-1) return diff --git a/ptypy/accelerate/cuda_cupy/dependencies.yml b/ptypy/accelerate/cuda_cupy/dependencies.yml index cb7d31fce..6331bbbc5 100644 --- a/ptypy/accelerate/cuda_cupy/dependencies.yml +++ b/ptypy/accelerate/cuda_cupy/dependencies.yml @@ -2,7 +2,7 @@ name: ptypy_cupy channels: - conda-forge dependencies: - - python=3.9 + - python - numpy - scipy - matplotlib diff --git a/ptypy/accelerate/cuda_pycuda/dependencies.yml b/ptypy/accelerate/cuda_pycuda/dependencies.yml index d8b9dfad9..455d60479 100644 --- a/ptypy/accelerate/cuda_pycuda/dependencies.yml +++ b/ptypy/accelerate/cuda_pycuda/dependencies.yml @@ -2,7 +2,7 @@ name: ptypy_pycuda channels: - conda-forge dependencies: - - python=3.9 + - python - numpy - scipy - matplotlib diff --git a/ptypy/accelerate/ocl_pyopencl/npy_kernels.py b/ptypy/accelerate/ocl_pyopencl/npy_kernels.py index 3c87978ae..8f09e94d0 100644 --- a/ptypy/accelerate/ocl_pyopencl/npy_kernels.py +++ b/ptypy/accelerate/ocl_pyopencl/npy_kernels.py @@ -102,7 +102,7 @@ def error_reduce(self, g_err_sum, offset=0): ## Actual math ## # Reduceses the Fourier error along the last 2 dimensions.fd - error_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(np.float) + error_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(float) def fmag_all_update(self, pbound, g_mag, g_mask, g_err_sum, offset=0): diff --git a/ptypy/accelerate/ocl_pyopencl/npy_kernels_for_block.py 
b/ptypy/accelerate/ocl_pyopencl/npy_kernels_for_block.py index 85c01d4be..b8a284492 100644 --- a/ptypy/accelerate/ocl_pyopencl/npy_kernels_for_block.py +++ b/ptypy/accelerate/ocl_pyopencl/npy_kernels_for_block.py @@ -87,7 +87,7 @@ def error_reduce(self, addr, err_sum): ## Actual math ## # Reduceses the Fourier error along the last 2 dimensions.fd - #err_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(np.float) + #err_sum[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(float) err_sum[:] = ferr.sum(-1).sum(-1) return diff --git a/ptypy/accelerate/ocl_pyopencl/ocl_fft.py b/ptypy/accelerate/ocl_pyopencl/ocl_fft.py index 26e08c298..4cfa33905 100644 --- a/ptypy/accelerate/ocl_pyopencl/ocl_fft.py +++ b/ptypy/accelerate/ocl_pyopencl/ocl_fft.py @@ -174,7 +174,7 @@ def __init__(self, queue, array, # attach scaling from reikna.transformations import mul_param - sc = mul_param(array, np.float) + sc = mul_param(array, float) ftreikna.parameter.output.connect(sc, sc.input, out=sc.output, scale=sc.param) iscale = np.sqrt(np.prod(array.shape[-2:])) if symmetric else 1.0 scale = 1.0 / iscale diff --git a/ptypy/accelerate/ocl_pyopencl/ocl_kernels_self_contained_for_future_reference.py b/ptypy/accelerate/ocl_pyopencl/ocl_kernels_self_contained_for_future_reference.py index 9d4b5d88a..20793ea94 100644 --- a/ptypy/accelerate/ocl_pyopencl/ocl_kernels_self_contained_for_future_reference.py +++ b/ptypy/accelerate/ocl_pyopencl/ocl_kernels_self_contained_for_future_reference.py @@ -253,7 +253,7 @@ def ocl_fourier_error(self, f, fmag, fdev, ferr, fmask, mask_sum): self.queue.finish() def npy_error_reduce(self, ferr, err_fmag): - err_fmag[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(np.float) + err_fmag[:] = ferr.astype(np.double).sum(-1).sum(-1).astype(float) def ocl_error_reduce(self, ferr, err_fmag): shape = (self.fshape[0], 64), diff --git a/ptypy/core/sample.py b/ptypy/core/sample.py index 6f5702424..bdb14b4f3 100644 --- a/ptypy/core/sample.py +++ b/ptypy/core/sample.py 
@@ -361,7 +361,7 @@ def simulate(A, pars, energy, fill=1.0, prefix="", **kwargs): logger.info(prefix + "Simulation resource is a thickness profile") # Enforce floats - ob = obj.astype(np.float) + ob = obj.astype(float) ob -= ob.min() if d is not None: logger.info(prefix + "Rescaling to maximum thickness") diff --git a/ptypy/core/xy.py b/ptypy/core/xy.py index f5c2e6f8f..96d35cdaa 100644 --- a/ptypy/core/xy.py +++ b/ptypy/core/xy.py @@ -152,7 +152,7 @@ def _complete(extent, steps, spacing): elif steps is None: e = u.expect2(extent) s = u.expect2(spacing) - l = (e / s).astype(np.int) + l = (e / s).astype(int) elif spacing is None: e = u.expect2(extent) l = u.expect2(steps) diff --git a/ptypy/custom/ePIE_parallel.py b/ptypy/custom/ePIE_parallel.py index f92b1b9d0..a16e909db 100644 --- a/ptypy/custom/ePIE_parallel.py +++ b/ptypy/custom/ePIE_parallel.py @@ -179,7 +179,7 @@ def engine_prepare(self): if pod.active: self.ob_nodecover[pod.ob_view] = 1 self.nodemask = np.array(list(self.ob_nodecover.S.values())[0].data[0], - dtype=np.bool) + dtype=bool) # communicate this over MPI parallel.allreduceC(self.ob_nodecover) diff --git a/ptypy/engines/Bragg3d_engines.py b/ptypy/engines/Bragg3d_engines.py index 6e7cb3407..a12be75b6 100644 --- a/ptypy/engines/Bragg3d_engines.py +++ b/ptypy/engines/Bragg3d_engines.py @@ -167,7 +167,7 @@ def object_update(self): r = np.sqrt((x_ - xcenter)**2 + (y_ - ycenter)**2) scaling = np.min(geo.resolution) r /= scaling - r = r.astype(np.int) + r = r.astype(int) tbin = np.bincount(r.ravel(), arr.ravel()) nr = np.bincount(r.ravel()) s = np.arange(len(tbin)) * scaling diff --git a/ptypy/experiment/cSAXS.py b/ptypy/experiment/cSAXS.py index 236dba49d..59ef3689b 100644 --- a/ptypy/experiment/cSAXS.py +++ b/ptypy/experiment/cSAXS.py @@ -100,7 +100,7 @@ def check(self, frames, start): def load(self, indices): raw = {} for i in indices: - raw[i] = self.data_object.getframe(i).data.astype(np.float) + raw[i] = 
self.data_object.getframe(i).data.astype(float) return raw, {}, {} diff --git a/ptypy/experiment/optiklabor.py b/ptypy/experiment/optiklabor.py index 96e757067..81ed97fb6 100644 --- a/ptypy/experiment/optiklabor.py +++ b/ptypy/experiment/optiklabor.py @@ -134,7 +134,7 @@ def load_common(self): exposures =[] for j in range(self.nexp): darks,meta = u.image_read(self.info.dark_dir + '/ccd*_%02d.raw' % j) - dark_imgs.append(np.array(darks,dtype=np.float).mean(0)) + dark_imgs.append(np.array(darks,dtype=float).mean(0)) exposures.append(meta[0][self.exp_string]) # save in common dict/Param diff --git a/ptypy/simulations/detector.py b/ptypy/simulations/detector.py index 7b2fb6254..ff3ff3ad6 100644 --- a/ptypy/simulations/detector.py +++ b/ptypy/simulations/detector.py @@ -109,20 +109,20 @@ def _update(self,pars=None): def _make_mask(self): gaps = expect2(self.gaps) - module = np.ones(self.shape).astype(np.bool) + module = np.ones(self.shape).astype(bool) start = module.copy() for i in range(self.modules[0]-1): - gap = np.zeros((gaps[0],module.shape[1])).astype(np.bool) + gap = np.zeros((gaps[0],module.shape[1])).astype(bool) start = np.concatenate([start,np.concatenate([gap,module],axis=0)],axis=0) module = start.copy() for i in range(self.modules[1]-1): - gap = np.zeros((module.shape[0],gaps[1])).astype(np.bool) + gap = np.zeros((module.shape[0],gaps[1])).astype(bool) start = np.concatenate([start,np.concatenate([gap,module],axis=1)],axis=1) self._mask = start def _get_mask(self,sh): msh = expect2(sh[-2:]) - mask = np.zeros(msh).astype(np.bool) + mask = np.zeros(msh).astype(bool) offset = msh//2 - expect2(self.center) mask = fill2D(mask,self._mask,-offset) return np.resize(mask,sh) diff --git a/ptypy/utils/misc.py b/ptypy/utils/misc.py index 5a352b329..75dd377c4 100644 --- a/ptypy/utils/misc.py +++ b/ptypy/utils/misc.py @@ -341,7 +341,7 @@ def clean_path(filename): def electron_wavelength(electron_energy): - """ + r""" Calculate electron wavelength based on energy in 
keV: .. math:: diff --git a/ptypy/utils/scripts.py b/ptypy/utils/scripts.py index c49cb269d..97e3248b3 100644 --- a/ptypy/utils/scripts.py +++ b/ptypy/utils/scripts.py @@ -147,7 +147,7 @@ def hdr_image(img_list, exp_list, thresholds=[3000,50000], dark_list=[], img_list = [img.astype(float) for img in img_list] dark_list = [dark.astype(float) for dark in dark_list] exp_list = [float(exp) for exp in exp_list] - mask_list = [mask.astype(np.int) for mask in mask_list] + mask_list = [mask.astype(int) for mask in mask_list] for img, dark, exp,mask in zip(img_list, dark_list,exp_list,mask_list): img[:] = abs(img - dark) @@ -177,7 +177,7 @@ def hdr_image(img_list, exp_list, thresholds=[3000,50000], dark_list=[], ix[j]][themask.astype(bool)] * max_exp/exp_list[ix[j]]) else: - mask_sum = np.zeros_like(mask_list[0]).astype(np.int) + mask_sum = np.zeros_like(mask_list[0]).astype(int) img_hdr = np.zeros_like(img_list[0]) for img, exp, mask in zip(img_list,exp_list,mask_list): img = img * max_exp/exp diff --git a/test/template_tests/prep_and_run_moonflower_test.py b/test/template_tests/prep_and_run_moonflower_test.py index 114ec35ee..8f2a825b2 100644 --- a/test/template_tests/prep_and_run_moonflower_test.py +++ b/test/template_tests/prep_and_run_moonflower_test.py @@ -33,7 +33,6 @@ def test_dm_single_probe(self): p.engines.engine00.name = 'DM' p.engines.engine00.numiter = 5 P = Ptycho(p,level=5) - return P def test_dm_multiple_probes(self): p = u.Param() @@ -67,7 +66,6 @@ def test_dm_multiple_probes(self): p.engines.engine00.numiter = 5 p.engines.engine00.fourier_relax_factor = 0.05 P = Ptycho(p,level=5) - return P def test_dm_resample(self): p = u.Param() @@ -97,7 +95,6 @@ def test_dm_resample(self): p.engines.engine00.name = 'DM' p.engines.engine00.numiter = 5 P = Ptycho(p,level=5) - return P def test_ml_single_probe(self): p = u.Param() @@ -134,7 +131,6 @@ def test_ml_single_probe(self): p.engines.engine00.floating_intensities = False p.engines.engine00.numiter = 5 P = 
Ptycho(p,level=5) - return P def test_ml_resample(self): p = u.Param() @@ -172,7 +168,6 @@ def test_ml_resample(self): p.engines.engine00.floating_intensities = False p.engines.engine00.numiter = 5 P = Ptycho(p,level=5) - return P if __name__ == '__main__': unittest.main() From fd52d7c3b22489e445cdf5bc583a054b8a52c827 Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Wed, 19 Jul 2023 12:07:05 +0100 Subject: [PATCH 09/37] Changes in numpy 1.25 (#492) * check for longdouble * convert size-1 numpy arrays to scalars * replaced np.product with np.prod --- archive/cuda_extension/python/gpu_extension.pyx | 2 +- benchmark/mpi_allreduce_speed.py | 4 ++-- ptypy/accelerate/cuda_cupy/array_utils.py | 12 ++++++------ ptypy/accelerate/cuda_cupy/cufft.py | 2 +- ptypy/accelerate/cuda_pycuda/array_utils.py | 12 ++++++------ ptypy/accelerate/cuda_pycuda/cufft.py | 6 +++--- ptypy/experiment/diamond_nexus.py | 6 +++--- ptypy/experiment/hdf5_loader.py | 6 +++--- ptypy/utils/descriptor.py | 3 ++- 9 files changed, 27 insertions(+), 26 deletions(-) diff --git a/archive/cuda_extension/python/gpu_extension.pyx b/archive/cuda_extension/python/gpu_extension.pyx index aa0b36402..f0db75587 100644 --- a/archive/cuda_extension/python/gpu_extension.pyx +++ b/archive/cuda_extension/python/gpu_extension.pyx @@ -153,7 +153,7 @@ def abs2(input): cdef np.float32_t [:,:,::1] cout_3c cdef np.float64_t [:,::1] cout_d2c cdef np.float64_t [:,:,::1] cout_d3c - cdef int n = np.product(cin.shape) + cdef int n = np.prod(cin.shape) cdef np.float32_t [:, ::1] cin_f2c cdef np.complex64_t [:, ::1] cin_c2c diff --git a/benchmark/mpi_allreduce_speed.py b/benchmark/mpi_allreduce_speed.py index 5102e35af..2e562d944 100644 --- a/benchmark/mpi_allreduce_speed.py +++ b/benchmark/mpi_allreduce_speed.py @@ -11,7 +11,7 @@ } def run_benchmark(shape): - megabytes = np.product(shape) * 8 / 1024 / 1024 * 2 + megabytes = np.prod(shape) * 8 / 1024 / 1024 * 2 data = np.zeros(shape, dtype=np.complex64) @@ -39,4 +39,4 @@ def 
run_benchmark(shape): print('Final results for {} processes'.format(parallel.size)) print(','.join(['Name', 'Duration', 'MB', 'MB/s'])) for r in res: - print(','.join([str(x) for x in r])) \ No newline at end of file + print(','.join([str(x) for x in r])) diff --git a/ptypy/accelerate/cuda_cupy/array_utils.py b/ptypy/accelerate/cuda_cupy/array_utils.py index 911c6111d..9c68d9431 100644 --- a/ptypy/accelerate/cuda_cupy/array_utils.py +++ b/ptypy/accelerate/cuda_cupy/array_utils.py @@ -279,15 +279,15 @@ def delxf(self, input, out, axis=-1): self.queue.use() if axis == input.ndim - 1: - flat_dim = np.int32(np.product(input.shape[0:-1])) + flat_dim = np.int32(np.prod(input.shape[0:-1])) self.delxf_last(( int((flat_dim + self.last_axis_block[1] - 1) // self.last_axis_block[1]), 1, 1), self.last_axis_block, (input, out, flat_dim, np.int32(input.shape[axis]))) else: - lower_dim = np.int32(np.product(input.shape[(axis+1):])) - higher_dim = np.int32(np.product(input.shape[:axis])) + lower_dim = np.int32(np.prod(input.shape[(axis+1):])) + higher_dim = np.int32(np.prod(input.shape[:axis])) gx = int( (lower_dim + self.mid_axis_block[0] - 1) // self.mid_axis_block[0]) gy = 1 @@ -306,14 +306,14 @@ def delxb(self, input, out, axis=-1): if self.queue is not None: self.queue.use() if axis == input.ndim - 1: - flat_dim = np.int32(np.product(input.shape[0:-1])) + flat_dim = np.int32(np.prod(input.shape[0:-1])) self.delxb_last(( int((flat_dim + self.last_axis_block[1] - 1) // self.last_axis_block[1]), 1, 1), self.last_axis_block, (input, out, flat_dim, np.int32(input.shape[axis]))) else: - lower_dim = np.int32(np.product(input.shape[(axis+1):])) - higher_dim = np.int32(np.product(input.shape[:axis])) + lower_dim = np.int32(np.prod(input.shape[(axis+1):])) + higher_dim = np.int32(np.prod(input.shape[:axis])) gx = int( (lower_dim + self.mid_axis_block[0] - 1) // self.mid_axis_block[0]) gy = 1 diff --git a/ptypy/accelerate/cuda_cupy/cufft.py b/ptypy/accelerate/cuda_cupy/cufft.py index 
794efb858..707aba2f7 100644 --- a/ptypy/accelerate/cuda_cupy/cufft.py +++ b/ptypy/accelerate/cuda_cupy/cufft.py @@ -23,7 +23,7 @@ def __init__(self, array, queue=None, if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]: raise ValueError( "CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048") - self.batches = int(np.product( + self.batches = int(np.prod( array.shape[0:dims-2]) if dims > 2 else 1) self.forward = forward diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py index 2abd02ba4..72eae996f 100644 --- a/ptypy/accelerate/cuda_pycuda/array_utils.py +++ b/ptypy/accelerate/cuda_pycuda/array_utils.py @@ -270,7 +270,7 @@ def delxf(self, input, out, axis=-1): axis = np.int32(axis) if axis == input.ndim - 1: - flat_dim = np.int32(np.product(input.shape[0:-1])) + flat_dim = np.int32(np.prod(input.shape[0:-1])) self.delxf_last(input, out, flat_dim, np.int32(input.shape[axis]), block=self.last_axis_block, grid=( @@ -280,8 +280,8 @@ def delxf(self, input, out, axis=-1): stream=self.queue ) else: - lower_dim = np.int32(np.product(input.shape[(axis+1):])) - higher_dim = np.int32(np.product(input.shape[:axis])) + lower_dim = np.int32(np.prod(input.shape[(axis+1):])) + higher_dim = np.int32(np.prod(input.shape[:axis])) gx = int( (lower_dim + self.mid_axis_block[0] - 1) // self.mid_axis_block[0]) gy = 1 @@ -301,7 +301,7 @@ def delxb(self, input, out, axis=-1): axis = np.int32(axis) if axis == input.ndim - 1: - flat_dim = np.int32(np.product(input.shape[0:-1])) + flat_dim = np.int32(np.prod(input.shape[0:-1])) self.delxb_last(input, out, flat_dim, np.int32(input.shape[axis]), block=self.last_axis_block, grid=( @@ -311,8 +311,8 @@ def delxb(self, input, out, axis=-1): stream=self.queue ) else: - lower_dim = np.int32(np.product(input.shape[(axis+1):])) - higher_dim = np.int32(np.product(input.shape[:axis])) + lower_dim = np.int32(np.prod(input.shape[(axis+1):])) + higher_dim = 
np.int32(np.prod(input.shape[:axis])) gx = int( (lower_dim + self.mid_axis_block[0] - 1) // self.mid_axis_block[0]) gy = 1 diff --git a/ptypy/accelerate/cuda_pycuda/cufft.py b/ptypy/accelerate/cuda_pycuda/cufft.py index d10e82b1a..4859b36b2 100644 --- a/ptypy/accelerate/cuda_pycuda/cufft.py +++ b/ptypy/accelerate/cuda_pycuda/cufft.py @@ -21,7 +21,7 @@ def __init__(self, array, queue=None, columns = self.arr_shape[1] if rows != columns or rows not in [16, 32, 64, 128, 256, 512, 1024, 2048]: raise ValueError("CUDA FFT only supports powers of 2 for rows/columns, from 16 to 2048") - self.batches = int(np.product(array.shape[0:dims-2]) if dims > 2 else 1) + self.batches = int(np.prod(array.shape[0:dims-2]) if dims > 2 else 1) self.forward = forward self._load(array, pre_fft, post_fft, symmetric, forward) @@ -121,11 +121,11 @@ def _load(self, array, pre_fft, post_fft, symmetric, forward): ) # with cuFFT, we need to scale ifft if not symmetric and not forward: - self.scale = 1 / np.product(self.arr_shape) + self.scale = 1 / np.prod(self.arr_shape) elif forward and not symmetric: self.scale = 1.0 else: - self.scale = 1 / np.sqrt(np.product(self.arr_shape)) + self.scale = 1 / np.sqrt(np.prod(self.arr_shape)) if pre_fft is not None: self.pre_fft = gpuarray.to_gpu(pre_fft) diff --git a/ptypy/experiment/diamond_nexus.py b/ptypy/experiment/diamond_nexus.py index 318887756..5b2f534c1 100644 --- a/ptypy/experiment/diamond_nexus.py +++ b/ptypy/experiment/diamond_nexus.py @@ -214,18 +214,18 @@ def __init__(self, pars=None, **kwargs): if None not in [INPUT_FILE, ENERGY_KEY]: - self.p.energy = float(h5.File(INPUT_FILE, 'r')[ENERGY_KEY][()] * self.ENERGY_MULTIPLIER) + self.p.energy = float(h5.File(INPUT_FILE, 'r')[ENERGY_KEY][()].item() * self.ENERGY_MULTIPLIER) self.meta.energy = self.p.energy log(3, "loading energy={} from file".format(self.p.energy)) if None not in [INPUT_FILE, DISTANCE_KEY]: - self.p.distance = h5.File(INPUT_FILE, 'r')[DISTANCE_KEY][()] + self.p.distance = 
h5.File(INPUT_FILE, 'r')[DISTANCE_KEY][()].item() self.meta.distance = self.p.distance log(3, "loading distance={} from file".format(self.p.distance)) if None not in [INPUT_FILE, PIXEL_SIZE_KEY]: - self.p.psize = h5.File(INPUT_FILE, 'r')[PIXEL_SIZE_KEY][()] + self.p.psize = h5.File(INPUT_FILE, 'r')[PIXEL_SIZE_KEY][()].item() self.meta.psize = self.p.psize log(3, "loading psize={} from file".format(self.p.psize)) diff --git a/ptypy/experiment/hdf5_loader.py b/ptypy/experiment/hdf5_loader.py index 80ecc2b87..c72949d70 100644 --- a/ptypy/experiment/hdf5_loader.py +++ b/ptypy/experiment/hdf5_loader.py @@ -539,20 +539,20 @@ def _prepare_meta_info(self): if self._is_spectro_scan and self.p.outer_index is not None: self.p.energy = float(f[self.p.recorded_energy.key][self.p.outer_index]) else: - self.p.energy = float(f[self.p.recorded_energy.key][()]) + self.p.energy = float(f[self.p.recorded_energy.key][()].item()) self.p.energy = self.p.energy * self.p.recorded_energy.multiplier + self.p.recorded_energy.offset self.meta.energy = self.p.energy log(3, "loading energy={} from file".format(self.p.energy)) if None not in [self.p.recorded_distance.file, self.p.recorded_distance.key]: with h5.File(self.p.recorded_distance.file, 'r', swmr=self._is_swmr) as f: - self.p.distance = float(f[self.p.recorded_distance.key][()] * self.p.recorded_distance.multiplier) + self.p.distance = float(f[self.p.recorded_distance.key][()].item() * self.p.recorded_distance.multiplier) self.meta.distance = self.p.distance log(3, "loading distance={} from file".format(self.p.distance)) if None not in [self.p.recorded_psize.file, self.p.recorded_psize.key]: with h5.File(self.p.recorded_psize.file, 'r', swmr=self._is_swmr) as f: - self.p.psize = float(f[self.p.recorded_psize.key][()] * self.p.recorded_psize.multiplier) + self.p.psize = float(f[self.p.recorded_psize.key][()].item() * self.p.recorded_psize.multiplier) self.info.psize = self.p.psize log(3, "loading psize={} from file".format(self.p.psize)) 
diff --git a/ptypy/utils/descriptor.py b/ptypy/utils/descriptor.py index 554dc65b2..714a581dd 100644 --- a/ptypy/utils/descriptor.py +++ b/ptypy/utils/descriptor.py @@ -853,7 +853,8 @@ def _walk(self, depth=0, pars=None, ignore_symlinks=False, ignore_wildcards=Fals (type(pars).__name__ == 'tuple' and 'list' in self.type) or \ (type(pars).__name__ == 'list' and 'tuple' in self.type) or \ (type(pars).__name__ == 'int' and 'float' in self.type) or \ - (type(pars).__name__[:5] == 'float' and 'float' in self.type): + (type(pars).__name__[:5] == 'float' and 'float' in self.type) or \ + (type(pars).__name__ == 'longdouble' and 'float' in self.type): yield {'d': self, 'path': path, 'status': 'ok', 'info': ''} else: yield {'d': self, 'path': path, 'status': 'wrongtype', 'info': type(pars).__name__} From cc0932baae9daca254122d1d3979a350178d005c Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Wed, 19 Jul 2023 13:22:51 +0100 Subject: [PATCH 10/37] Properly clean up accelerated ML engines to allow chaining (#491) --- ptypy/accelerate/base/engines/ML_serial.py | 1 + templates/engines/moonflower_ML_ML.py | 71 +++++++++++++++++++ templates/engines/moonflower_ML_ML_pycuda.py | 72 ++++++++++++++++++++ 3 files changed, 144 insertions(+) create mode 100644 templates/engines/moonflower_ML_ML.py create mode 100644 templates/engines/moonflower_ML_ML_pycuda.py diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py index 248110326..38f63f385 100644 --- a/ptypy/accelerate/base/engines/ML_serial.py +++ b/ptypy/accelerate/base/engines/ML_serial.py @@ -348,6 +348,7 @@ def engine_finalize(self): prep = self.diff_info[d.ID] float_intens_coeff[label] = prep.float_intens_coeff self.ptycho.runtime["float_intens"] = parallel.gather_dict(float_intens_coeff) + super().engine_finalize() class BaseModelSerial(BaseModel): diff --git a/templates/engines/moonflower_ML_ML.py b/templates/engines/moonflower_ML_ML.py new file mode 100644 index 
000000000..fed34ca6a --- /dev/null +++ b/templates/engines/moonflower_ML_ML.py @@ -0,0 +1,71 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" +#import ptypy +from ptypy.core import Ptycho +from ptypy import utils as u + +import tempfile +tmpdir = tempfile.gettempdir() + +p = u.Param() + +# for verbose output +p.verbose_level = "info" + +# set home path +p.io = u.Param() +p.io.home = "/".join([tmpdir, "ptypy"]) + +# saving intermediate results +p.io.autosave = u.Param(active=False) + +# opens plotting GUI if interaction is set to active +p.io.autoplot = u.Param(active=False) +p.io.interaction = u.Param(active=False) + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +p.scans.MF.name = 'BlockFull' +p.scans.MF.data= u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 200 +p.scans.MF.data.save = None + +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photon in empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0. + +# attach a reconstruction engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'ML' +p.engines.engine00.ML_type = 'Gaussian' +p.engines.engine00.reg_del2 = True # Whether to use a Gaussian prior (smoothing) regularizer +p.engines.engine00.reg_del2_amplitude = 1. # Amplitude of the Gaussian prior if used +p.engines.engine00.scale_precond = True +#p.engines.engine00.scale_probe_object = 1. +p.engines.engine00.smooth_gradient = 20. +p.engines.engine00.smooth_gradient_decay = 1/50.
+p.engines.engine00.floating_intensities = False +p.engines.engine00.numiter = 300 + +p.engines.engine01 = u.Param() +p.engines.engine01.name = 'ML' +p.engines.engine01.numiter = 20 +p.engines.engine01.numiter_contiguous = 5 +p.engines.engine01.reg_del2 = False +p.engines.engine01.reg_del2_amplitude = 1. +p.engines.engine01.floating_intensities = False +p.engines.engine01.probe_support = 0.5 + +# prepare and run +if __name__ == "__main__": + P = Ptycho(p,level=5) diff --git a/templates/engines/moonflower_ML_ML_pycuda.py b/templates/engines/moonflower_ML_ML_pycuda.py new file mode 100644 index 000000000..d506ec5ae --- /dev/null +++ b/templates/engines/moonflower_ML_ML_pycuda.py @@ -0,0 +1,72 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +from ptypy.core import Ptycho +from ptypy import utils as u +import ptypy +ptypy.load_gpu_engines(arch="cuda") + +import tempfile +tmpdir = tempfile.gettempdir() + +p = u.Param() + +# for verbose output +p.verbose_level = "info" +p.frames_per_block = 400 +# set home path +p.io = u.Param() +p.io.home = "/".join([tmpdir, "ptypy"]) +p.io.autosave = u.Param(active=False) +p.io.autoplot = u.Param(active=False) +p.io.interaction = u.Param(active=False) + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. 
+p.scans.MF.name = 'BlockFull' +p.scans.MF.data= u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 100 +p.scans.MF.data.save = None + +p.scans.MF.illumination = u.Param(diversity=None) +p.scans.MF.coherence = u.Param(num_probe_modes=1) +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photon in empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0. + +# attach a reconstruction engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'ML_pycuda' +p.engines.engine00.numiter = 300 +p.engines.engine00.numiter_contiguous = 5 +p.engines.engine00.reg_del2 = True # Whether to use a Gaussian prior (smoothing) regularizer +p.engines.engine00.reg_del2_amplitude = 1. # Amplitude of the Gaussian prior if used +p.engines.engine00.scale_precond = True +p.engines.engine00.smooth_gradient = 20. +p.engines.engine00.smooth_gradient_decay = 1/50. +p.engines.engine00.floating_intensities = False + +p.engines.engine01 = u.Param() +p.engines.engine01.name = 'ML_pycuda' +p.engines.engine01.numiter = 20 +p.engines.engine01.numiter_contiguous = 5 +p.engines.engine01.reg_del2 = False +p.engines.engine01.reg_del2_amplitude = 1.
+p.engines.engine01.floating_intensities = False +p.engines.engine01.probe_support = 0.5 + +# prepare and run +if __name__ == "__main__": + P = Ptycho(p,level=5) From 6cad7cee7f11f842f47dd702587d0144fdcffb32 Mon Sep 17 00:00:00 2001 From: Jari Date: Wed, 19 Jul 2023 16:12:47 +0100 Subject: [PATCH 11/37] Add Euclidean noise model for ML (#486) --- ptypy/engines/ML.py | 178 +++++++++++++++++++++- templates/engines/moonflower_ML_Euclid.py | 62 ++++++++ 2 files changed, 234 insertions(+), 6 deletions(-) create mode 100644 templates/engines/moonflower_ML_Euclid.py diff --git a/ptypy/engines/ML.py b/ptypy/engines/ML.py index fe08995ce..e7492b42f 100644 --- a/ptypy/engines/ML.py +++ b/ptypy/engines/ML.py @@ -45,7 +45,7 @@ class ML(PositionCorrectionEngine): type = str help = Likelihood model choices = ['gaussian','poisson','euclid'] - doc = One of ‘gaussian’, poisson’ or ‘euclid’. Only 'gaussian' is implemented. + doc = One of ‘gaussian’, poisson’ or ‘euclid’. [floating_intensities] default = False @@ -99,7 +99,7 @@ class ML(PositionCorrectionEngine): type = int lowlim = 0 help = Number of iterations before probe update starts - + """ SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull, GradFull, BlockGradFull] @@ -153,7 +153,7 @@ def __init__(self, ptycho_parent, pars=None): def engine_initialize(self): """ Prepare for ML reconstruction. 
- """ + """ super(ML, self).engine_initialize() # Object gradient and minimization direction @@ -182,7 +182,7 @@ def _initialize_model(self): elif self.p.ML_type.lower() == "poisson": self.ML_model = PoissonModel(self) elif self.p.ML_type.lower() == "euclid": - raise NotImplementedError('Euclid norm model not yet implemented') + self.ML_model = EuclidModel(self) else: raise RuntimeError("Unsupported ML_type: '%s'" % self.p.ML_type) @@ -235,7 +235,7 @@ def engine_iterate(self, num=1): cn2_new_pr_grad = Cnorm2(new_pr_grad) cn2_new_ob_grad = Cnorm2(new_ob_grad) if cn2_new_pr_grad > 1e-5: - scale_p_o = (self.p.scale_probe_object * cn2_new_ob_grad + scale_p_o = (self.p.scale_probe_object * cn2_new_ob_grad / cn2_new_pr_grad) else: scale_p_o = self.p.scale_probe_object @@ -601,7 +601,7 @@ class PoissonModel(BaseModel): def __init__(self, MLengine): """ - Core functions for ML computation using a Gaussian model. + Core functions for ML computation using a Poisson model. """ BaseModel.__init__(self, MLengine) from scipy import special @@ -745,6 +745,172 @@ def poly_line_coeffs(self, ob_h, pr_h): return B +class EuclidModel(BaseModel): + """ + Euclidean (Amplitude) noise model. + TODO: feed actual statistical weights instead of using a fixed variance. + """ + + def __init__(self, MLengine): + """ + Core functions for ML computation using a Euclidean model. + """ + BaseModel.__init__(self, MLengine) + + # Euclidean model requires weights + # TODO: update this part of the code once actual weights are passed in the PODs + self.weights = self.engine.di.copy(self.engine.di.ID + '_weights') + # FIXME: This part needs to be updated once statistical weights are properly + # supported in the data preparation. + for name, di_view in self.di.views.items(): + if not di_view.active: + continue + self.weights[di_view] = di_view.pod.ma_view.data # just the mask for now + #self.weights[di_view] = (di_view.pod.ma_view.data + # / (1. 
+ stat_weights/di_view.data)) + + def __del__(self): + """ + Clean up routine + """ + BaseModel.__del__(self) + del self.engine.ptycho.containers[self.weights.ID] + del self.weights + + def new_grad(self): + """ + Compute a new gradient direction according to a Euclidean noise model. + + Note: The negative log-likelihood and local errors are also computed + here. + """ + self.ob_grad.fill(0.) + self.pr_grad.fill(0.) + + # We need an array for MPI + LL = np.array([0.]) + error_dct = {} + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and amplitudes for this view + w = self.weights[diff_view] + A = np.sqrt(diff_view.data) + + Amodel = np.zeros_like(A) + f = {} + + # First pod loop: compute total amplitude + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f[name] = pod.fw(pod.probe * pod.object) + Amodel += np.sqrt(u.abs2(f[name])) + + # Floating intensity option + if self.p.floating_intensities: + self.float_intens_coeff[dname] = A.sum() / Amodel.sum() + Amodel *= self.float_intens_coeff[dname] + + Amodel += 1e-6 # cf Poisson model + DA = (1. - A / Amodel) + + # Second pod loop: gradients computation + LLL = np.sum((w * (Amodel - A)**2).astype(np.float64)) + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + xi = pod.bw(w*DA * f[name]) + self.ob_grad[pod.ob_view] += 2. * xi * pod.probe.conj() + self.pr_grad[pod.pr_view] += 2. 
* xi * pod.object.conj() + + diff_view.error = LLL + error_dct[dname] = np.array([0, LLL / np.prod(DA.shape), 0]) + LL += LLL + + # MPI reduction of gradients + self.ob_grad.allreduce() + self.pr_grad.allreduce() + parallel.allreduce(LL) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + self.ob_grad.storages[name].data += self.regularizer.grad( + s.data) + LL += self.regularizer.LL + self.LL = LL / self.tot_measpts + + return error_dct + + def poly_line_coeffs(self, ob_h, pr_h): + """ + Compute the coefficients of the polynomial for line minimization + in direction h + """ + + B = np.zeros((3,), dtype=np.longdouble) + Brenorm = 1. / self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and amplitudes for this view + w = self.weights[diff_view] + A = np.sqrt(diff_view.data) + + A0 = None + A1 = None + A2 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pod.probe * ob_h[pod.ob_view] + + pr_h[pod.pr_view] * pod.object) + b = pod.fw(pr_h[pod.pr_view] * ob_h[pod.ob_view]) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = (2 * np.real(f * b.conj()).astype(np.longdouble) + + u.abs2(a).astype(np.longdouble)) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += 2 * np.real(f * b.conj()) + u.abs2(a) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + + A0 += 1e-12 # cf Poisson model sqrt(1e-12) = 1e-6 + DA = 1. 
- A/np.sqrt(A0) + + B[0] += np.dot(w.flat, ((np.sqrt(A0) - A)**2).flat) * Brenorm + B[1] += np.dot(w.flat, (A1*DA).flat) * Brenorm + B[2] += (np.dot(w.flat, (A2*DA).flat) + .25*np.dot(w.flat, (A1**2 * A/A0**(3/2)).flat)) * Brenorm + + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B += Brenorm * self.regularizer.poly_line_coeffs( + ob_h.storages[name].data, s.data) + + self.B = B + + return B + + class Regul_del2(object): """\ Squared gradient regularizer (Gaussian prior). diff --git a/templates/engines/moonflower_ML_Euclid.py b/templates/engines/moonflower_ML_Euclid.py new file mode 100644 index 000000000..edcb076f0 --- /dev/null +++ b/templates/engines/moonflower_ML_Euclid.py @@ -0,0 +1,62 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" +#import ptypy +from ptypy.core import Ptycho +from ptypy import utils as u + +import tempfile +tmpdir = tempfile.gettempdir() + +p = u.Param() + +# for verbose output +p.verbose_level = "info" + +# set home path +p.io = u.Param() +p.io.home = "/".join([tmpdir, "ptypy"]) + +# saving intermediate results +p.io.autosave = u.Param(active=False) + +# opens plotting GUI if interaction set to active) +p.io.autoplot = u.Param(active=True) +p.io.interaction = u.Param(active=True) + +# max 100 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +p.scans.MF.name = 'BlockFull' +p.scans.MF.data= u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 200 +p.scans.MF.data.save = None + +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photon in empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0. 
+ +# attach a reconstruction engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'ML' +p.engines.engine00.ML_type = 'Euclid' +p.engines.engine00.reg_del2 = True # Whether to use a Gaussian prior (smoothing) regularizer +p.engines.engine00.reg_del2_amplitude = 1. # Amplitude of the Gaussian prior if used +p.engines.engine00.scale_precond = True +#p.engines.engine00.scale_probe_object = 1. +p.engines.engine00.smooth_gradient = 20. +p.engines.engine00.smooth_gradient_decay = 1/50. +p.engines.engine00.floating_intensities = False +p.engines.engine00.numiter = 300 + +# prepare and run +if __name__ == "__main__": + P = Ptycho(p,level=5) From f6a33761d37d78c5baa63f5a76cb0161c1052195 Mon Sep 17 00:00:00 2001 From: Timothy Poon <62692924+ptim0626@users.noreply.github.com> Date: Wed, 19 Jul 2023 18:23:49 +0100 Subject: [PATCH 12/37] Count CPU number as usable by current process but not whole system (#493) --- ptypy/experiment/hdf5_loader.py | 46 ++++++++++++++++----------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/ptypy/experiment/hdf5_loader.py b/ptypy/experiment/hdf5_loader.py index c72949d70..44344477f 100644 --- a/ptypy/experiment/hdf5_loader.py +++ b/ptypy/experiment/hdf5_loader.py @@ -55,14 +55,14 @@ class Hdf5Loader(PtyScan): [positions] default = type = Param - help = Parameters for the position information data.
+ doc = Shapes for each axis that are currently covered and tested corresponding to the intensity shapes are: * axis_data.shape (A, B) for data.shape (A, B, frame_size_m, frame_size_n), * axis_data.shape (k,) for data.shape (k, frame_size_m, frame_size_n), * axis_data.shape (C, D) for data.shape (C*D, frame_size_m, frame_size_n) , * axis_data.shape (C,) for data.shape (C, D, frame_size_m, frame_size_n) where D is the - size of the other axis, and + size of the other axis, and * axis_data.shape (C,) for data.shape (C*D, frame_size_m, frame_size_n) where D is the size of the other axis. @@ -114,7 +114,7 @@ class Hdf5Loader(PtyScan): [mask] default = type = Param - help = Parameters for mask data. + help = Parameters for mask data. doc = The shape of the loaded data is assumed to be (frame_size_m, frame_size_n) or the same shape of the full intensities data. @@ -153,7 +153,7 @@ class Hdf5Loader(PtyScan): [darkfield] default = type = Param - help = Parameters for darkfield data. + help = Parameters for darkfield data. doc = The shape is assumed to be (frame_size_m, frame_size_n) or the same shape of the full intensities data. @@ -190,7 +190,7 @@ class Hdf5Loader(PtyScan): help = Sigma value applied for automatic detection of outliers in the normalisation dataset. [framefilter] - default = + default = type = Param help = Parameters for the filtering of frames doc = The shape of loaded data is assumed to hvae the same dimensionality as data.shape[:-2] @@ -198,7 +198,7 @@ class Hdf5Loader(PtyScan): [framefilter.file] default = None type = str - help = This is the path to the file containing the filter information. + help = This is the path to the file containing the filter information. [framefilter.key] default = None @@ -210,7 +210,7 @@ class Hdf5Loader(PtyScan): type = Param help = This parameter contains information if we are use the recorded energy rather than as a parameter. It should be a scalar value. 
- + [recorded_energy.file] default = None type = str @@ -236,7 +236,7 @@ class Hdf5Loader(PtyScan): type = Param help = This parameter contains information if we are use the recorded distance to the detector rather than as a parameter, It should be a scalar value. - + [recorded_distance.file] default = None type = str @@ -257,7 +257,7 @@ class Hdf5Loader(PtyScan): type = Param help = This parameter contains information if we are use the recorded psize to the detector rather than as a parameter, It should be a scalar value. - + [recorded_psize.file] default = None type = str @@ -296,8 +296,8 @@ class Hdf5Loader(PtyScan): type = bool default = False help = Switch for loading data from electron ptychography experiments. - doc = If True, the energy provided in keV will be considered as electron energy - and converted to electron wavelengths. + doc = If True, the energy provided in keV will be considered as electron energy + and converted to electron wavelengths. """ def __init__(self, pars=None, swmr=False, **kwargs): @@ -330,7 +330,7 @@ def __init__(self, pars=None, swmr=False, **kwargs): self.framefilter = None self._is_spectro_scan = False self._is_swmr = swmr - + self.fhandle_intensities = None self.fhandle_positions_fast = None self.fhandle_positions_slow = None @@ -549,7 +549,7 @@ def _prepare_meta_info(self): self.p.distance = float(f[self.p.recorded_distance.key][()].item() * self.p.recorded_distance.multiplier) self.meta.distance = self.p.distance log(3, "loading distance={} from file".format(self.p.distance)) - + if None not in [self.p.recorded_psize.file, self.p.recorded_psize.key]: with h5.File(self.p.recorded_psize.file, 'r', swmr=self._is_swmr) as f: self.p.psize = float(f[self.p.recorded_psize.key][()].item() * self.p.recorded_psize.multiplier) @@ -870,7 +870,7 @@ def _finalize(self): class Hdf5LoaderFast(Hdf5Loader): def __init__(self, pars=None, **kwargs): super().__init__(pars=pars, **kwargs) - self.cpu_count_per_rank = max(os.cpu_count() // 
parallel.size,1) + self.cpu_count_per_rank = max(len(os.sched_getaffinity(0)) // parallel.size,1) print("Rank %d has access to %d processes" %(parallel.rank, self.cpu_count_per_rank)) self.intensities_array = None self.weights_array = None @@ -886,13 +886,13 @@ def subtract_dark(raw, dark): return corr @staticmethod - def _init_worker(intensities_raw_array, weights_raw_array, + def _init_worker(intensities_raw_array, weights_raw_array, intensities_handle, weights_handle, darkfield_handle, flatfield_handle, intensities_dtype, weights_dtype, - array_shape, + array_shape, mask_laid_out_like_data, darkfield_laid_out_like_data, flatfield_laid_out_like_data): @@ -909,7 +909,7 @@ def _init_worker(intensities_raw_array, weights_raw_array, @staticmethod def _read_intensities_and_weights(slices): ''' - Copy intensities/weights into memory and correct for + Copy intensities/weights into memory and correct for darkfield/flatfield if they exist ''' indexed_frame_slices, dest_slices = slices @@ -961,7 +961,7 @@ def _setup_raw_intensity_buffer(self, dtype, sh): return self._intensities_raw_array = RawArray(np.ctypeslib.as_ctypes_type(dtype), npixels) self.intensities_array = np.frombuffer(self._intensities_raw_array, self.intensities_dtype, -1).reshape(sh) - + def _setup_raw_weights_buffer(self, dtype, sh): npixels = int(np.prod(sh)) if (self.weights_array is not None) and (self.weights_array.size == npixels): @@ -979,13 +979,13 @@ def load_multiprocessing(self, src_slices): self._setup_raw_weights_buffer(self.mask_dtype, sh) dest_slices = [np.s_[i:i+1] for i in range(len(src_slices))] - with Pool(self.cpu_count_per_rank, + with Pool(self.cpu_count_per_rank, initializer=Hdf5LoaderFast._init_worker, initargs=(self._intensities_raw_array, self._weights_raw_array, self.intensities, self.mask, self.darkfield, self.flatfield, self.intensities_dtype, self.mask_dtype, sh, self.mask_laid_out_like_data, - self.darkfield_laid_out_like_data, + self.darkfield_laid_out_like_data, 
self.flatfield_field_laid_out_like_data)) as p: p.map(self._read_intensities_and_weights, zip(src_slices, dest_slices)) @@ -1013,7 +1013,7 @@ def load_unmapped_raster_scan(self, indices): self.fast_axis[slow_idx, fast_idx] * self.p.positions.fast_multiplier]) log(3, 'Data loaded successfully.') return intensities, positions, weights - + def load_mapped_and_raster_scan(self, indices): slices = [] @@ -1024,7 +1024,7 @@ def load_mapped_and_raster_scan(self, indices): if self._is_spectro_scan and self.p.outer_index is not None: indexed_frame_slices = (self.p.outer_index,) + indexed_frame_slices slices.append(indexed_frame_slices) - + self.load_multiprocessing(slices) intensities = {} From 771cfb674d00a8ba6d45af707493cf7cd3aa7c94 Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Thu, 10 Aug 2023 14:28:04 +0100 Subject: [PATCH 13/37] fix indentation for benchmarks --- ptypy/core/ptycho.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ptypy/core/ptycho.py b/ptypy/core/ptycho.py index f4034e23e..9686823a3 100644 --- a/ptypy/core/ptycho.py +++ b/ptypy/core/ptycho.py @@ -532,7 +532,7 @@ def init_data(self, print_stats=True): with LogTime(self.p.io.benchmark == 'all') as t: while not self.new_data: self.new_data = self.model.new_data() - if (self.p.io.benchmark == 'all') and parallel.master: self.benchmark.data_load += t.duration + if (self.p.io.benchmark == 'all') and parallel.master: self.benchmark.data_load += t.duration # Print stats parallel.barrier() From 94f3b83b80c18b6caabbd2560d58861621e55831 Mon Sep 17 00:00:00 2001 From: Benedikt Daurer Date: Thu, 10 Aug 2023 14:31:35 +0100 Subject: [PATCH 14/37] dump numbers for benchmarks --- benchmark/diamond_benchmarks/moonflower_scripts/i08.py | 1 + benchmark/diamond_benchmarks/moonflower_scripts/i13.py | 1 + benchmark/diamond_benchmarks/moonflower_scripts/i14_1.py | 1 + benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py | 1 + 4 files changed, 4 insertions(+) diff --git 
a/benchmark/diamond_benchmarks/moonflower_scripts/i08.py b/benchmark/diamond_benchmarks/moonflower_scripts/i08.py index 273a8ecbf..193c5693e 100644 --- a/benchmark/diamond_benchmarks/moonflower_scripts/i08.py +++ b/benchmark/diamond_benchmarks/moonflower_scripts/i08.py @@ -28,6 +28,7 @@ p.io.autoplot = u.Param(active=False) p.io.interaction = u.Param() p.io.interaction.server = u.Param(active=False) +p.io.benchmark = "all" # max 200 frames (128x128px) of diffraction data p.scans = u.Param() diff --git a/benchmark/diamond_benchmarks/moonflower_scripts/i13.py b/benchmark/diamond_benchmarks/moonflower_scripts/i13.py index 1cf42d5e4..edb0cd1e4 100644 --- a/benchmark/diamond_benchmarks/moonflower_scripts/i13.py +++ b/benchmark/diamond_benchmarks/moonflower_scripts/i13.py @@ -28,6 +28,7 @@ p.io.autoplot = u.Param(active=False) p.io.interaction = u.Param() p.io.interaction.server = u.Param(active=False) +p.io.benchmark = "all" # max 200 frames (128x128px) of diffraction data p.scans = u.Param() diff --git a/benchmark/diamond_benchmarks/moonflower_scripts/i14_1.py b/benchmark/diamond_benchmarks/moonflower_scripts/i14_1.py index 9d1abcccb..eaa848f4a 100644 --- a/benchmark/diamond_benchmarks/moonflower_scripts/i14_1.py +++ b/benchmark/diamond_benchmarks/moonflower_scripts/i14_1.py @@ -28,6 +28,7 @@ p.io.autoplot = u.Param(active=False) p.io.interaction = u.Param() p.io.interaction.server = u.Param(active=False) +p.io.benchmark = "all" # max 200 frames (128x128px) of diffraction data p.scans = u.Param() diff --git a/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py b/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py index 8e3c7241e..fcf483c47 100644 --- a/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py +++ b/benchmark/diamond_benchmarks/moonflower_scripts/i14_2.py @@ -29,6 +29,7 @@ p.io.autoplot = u.Param(active=False) p.io.interaction = u.Param() p.io.interaction.server = u.Param(active=False) +p.io.benchmark = "all" # max 200 frames (128x128px) of 
diffraction data p.scans = u.Param() From 148239f1313983cb6437ff5b30bb2c3e238fa6d9 Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Thu, 12 Oct 2023 15:55:30 +0100 Subject: [PATCH 15/37] Update CONTRIB.rst (#508) Co-authored-by: Bjoern Enders --- CONTRIB.rst | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/CONTRIB.rst b/CONTRIB.rst index a9aada0a4..1f26aebf5 100644 --- a/CONTRIB.rst +++ b/CONTRIB.rst @@ -26,24 +26,25 @@ Please ensure you satisfy most of PEP8_ recommendations. We are not dogmatic abo Testing ^^^^^^^ -Not much testing exists at the time of writing this document, but we are aware that this is something that should change. If you want to contribute code, it would be very good practice to also submit related tests. +All tests are in the (``/test/``) folder and our CI pipeline runs these tests for every commit (?). Please note that tests that require GPUs are disabled for the CI pipeline. Make sure to supply tests for new code or drastic changes to the existing code base. Smaller commits or bug fixes don't require an extra test. Branches ^^^^^^^^ +We are following the Gitflow https://www.atlassian.com/git/tutorials/comparing-workflows/gitflow-workflow development model where a development branch (``dev``) is merged into the master branch for every release. Individual features are developed on topic branches from the development branch and squash-merged back into it when the feature is mature. + The important permanent branches are: - - ``master``: the current cutting-edge but functional package. - - ``stable``: the latest release, recommended for production use. - - ``target``: target for a next release. This branch should stay up-to-date with ``master``, and contain planned updates that will break compatibility with the current version. - - other thematic and temporary branches will appear and disappear as new ideas are tried out and merged in.
+ - ``master``: (protected) the current release plus bugfixes / hotpatches. + - ``dev``: (protected) current branch for all developments. Features are branched from this branch and merged back into it upon completion. Development cycle ^^^^^^^^^^^^^^^^^ -There has been only two releases of the code up to now, so what we can tell about the *normal development cycle* for |ptypy| is rather limited. However the plan is as follows: - - Normal development usually happens on thematic branches. These branches are merged back to master when it is clear that (1) the feature is sufficiently debugged and tested and (2) no current functionality will break. - - At regular interval admins will decide to freeze the development for a new stable release. During this period, development will be allowed only on feature branches but master will accept only bug fixes. Once the stable release is done, development will continue. +|ptypy| does not follow a rigid release schedule. Releases are prepared for major events or when a set of features has reached maturity. + + - Normal development usually happens on thematic branches from the ``dev`` branch. These branches are merged back to ``dev`` when it is clear that (1) the feature is sufficiently debugged and tested and (2) no current functionality will break. + - For a release the dev branch will be merged back into master and that merge tagged as a release. 3. Pull requests @@ -51,16 +52,9 @@ There has been only two releases of the code up to now, so what we can tell abou Most likely you are a member of the |ptypy| team, which give you access to the full repository, but no right to commit changes. The proper way of doing this is *pull requests*. You can read about how this is done on github's `pull requests tutorial`_. -Pull requests can be made against one of the feature branches, or against ``target`` or ``master``.
In the latter cases, if your changes are deemed a bit too substantial, the first thing we will do is create a feature branch for your commits, and we will let it live for a little while, making sure that it is all fine. We will then merge it onto ``master`` (or ``target``). - -In principle bug fixes can be requested on the ``stable`` branch. - -3. Direct commits ------------------ - -If you are one of our power-users (or power-developers), you can be given rights to commit directly to |ptypy|. This makes things much simpler of course, but with great power comes great responsibility. +Pull requests shall be made against one of the feature branches, or against ``dev`` or ``master``. For PRs against master we will only accept bugfixes or smaller changes. Every other PR should be made against ``dev``. Your PR will be reviewed and discussed amongst the core developer team. The more you touch core libraries, the more scrutiny your PR will face. However, we created two folders in the main source folder where you have more freedom to try out things. For example, if you want to provide a new reconstruction engine, place it into the ``custom/`` folder. A new ``PtyScan`` subclass that prepares data from your experiment is best placed in the ``experiment/`` folder. -To make sure that things are done cleanly, we encourage all the core developers to create thematic remote branches instead of committing always onto master. Merging these thematic branches will be done as a collective decision during one of the regular admin meetings. +If you develop a new feature on a topic branch, it is your responsibility to keep it current with the dev branch to avoid merge conflicts. .. |ptypy| replace:: PtyPy @@ -68,4 +62,4 @@ To make sure that things are done cleanly, we encourage all the core developers .. _PEP8: https://www.python.org/dev/peps/pep-0008/ -.. _`pull requests tutorial`: https://help.github.com/articles/using-pull-requests/ \ No newline at end of file +..
_`pull requests tutorial`: https://help.github.com/articles/using-pull-requests/ From c544fefde6dafba46c53b64acccab543aa8ec8a2 Mon Sep 17 00:00:00 2001 From: Thomas Milburn Date: Thu, 12 Oct 2023 15:59:37 +0100 Subject: [PATCH 16/37] Updated docstrings which are missing choices (#507) * Added more choices to comments --------- Co-authored-by: Thomas Milburn --- archive/cuda_extension/engines/DM_gpu.py | 1 + archive/cuda_extension/engines/DM_npy.py | 1 + archive/engines/DM.py | 1 + ptypy/core/data.py | 2 ++ ptypy/core/illumination.py | 1 + ptypy/core/manager.py | 4 ++++ ptypy/core/ptycho.py | 4 ++++ ptypy/core/sample.py | 1 + ptypy/engines/base.py | 1 + ptypy/engines/projectional.py | 1 + 10 files changed, 17 insertions(+) diff --git a/archive/cuda_extension/engines/DM_gpu.py b/archive/cuda_extension/engines/DM_gpu.py index 399eb143c..9e81ad7fa 100644 --- a/archive/cuda_extension/engines/DM_gpu.py +++ b/archive/cuda_extension/engines/DM_gpu.py @@ -57,6 +57,7 @@ class DMGpu(DMNpy): default = 'linear' type = str help = Subpixel interpolation; 'fourier','linear' or None for no interpolation + choices = ['fourier','linear',None] [update_object_first] default = True diff --git a/archive/cuda_extension/engines/DM_npy.py b/archive/cuda_extension/engines/DM_npy.py index f601a46dd..6fce4bc5d 100644 --- a/archive/cuda_extension/engines/DM_npy.py +++ b/archive/cuda_extension/engines/DM_npy.py @@ -55,6 +55,7 @@ class DMNpy(DM): default = 'linear' type = str help = Subpixel interpolation; 'fourier','linear' or None for no interpolation + choices = ['fourier','linear',None] [update_object_first] default = True diff --git a/archive/engines/DM.py b/archive/engines/DM.py index 50936bd42..1124158a2 100644 --- a/archive/engines/DM.py +++ b/archive/engines/DM.py @@ -55,6 +55,7 @@ class DM(PositionCorrectionEngine): default = 'linear' type = str help = Subpixel interpolation; 'fourier','linear' or None for no interpolation + choices = ['fourier','linear',None] [update_object_first] 
default = True diff --git a/ptypy/core/data.py b/ptypy/core/data.py index 636857bca..288def4bf 100644 --- a/ptypy/core/data.py +++ b/ptypy/core/data.py @@ -114,6 +114,7 @@ class PtyScan(object): default = data help = Determines what will be loaded in parallel doc = Choose from ``None``, ``'data'``, ``'common'``, ``'all'`` + choices = ['data', 'common', 'all'] [rebin] type = int @@ -139,6 +140,7 @@ class PtyScan(object): Alternatively, a 3-tuple of booleans may be provided ``(do_transpose, do_flipud, do_fliplr)`` + choices = [0, 1, 2, 3, 4, 5, 6, 7] userlevel = 1 [min_frames] diff --git a/ptypy/core/illumination.py b/ptypy/core/illumination.py index af5d4c06b..ddf1f98f7 100644 --- a/ptypy/core/illumination.py +++ b/ptypy/core/illumination.py @@ -130,6 +130,7 @@ - *