ptycho · daurer · Mar 11, 2024 · Mar 5, 2024 · Mar 5, 2024 · Mar 5, 2024
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -24,24 +24,39 @@ jobs:
       max-parallel: 10
       fail-fast: false
       matrix:
-        python-version: ['3.8','3.9','3.10', '3.11']
-    name: Testing with Python ${{ matrix.python-version }} 
+        python-version: # There is a bug in conda 24.1.2 for Python < 3.12 so we pin the version to 23.11.0
+          - python: "3.8"
+            conda: "23.11.0"
+          - python: "3.9"
+            conda: "23.11.0"
+          - python: "3.10"
+            conda: "23.11.0"
+          - python: "3.11"
+            conda: "23.11.0"
+          - python: "3.12"
+            conda: "latest"        
+    name: Testing with Python ${{ matrix.python-version.python }} 
     steps:
     - name: Checkout
       uses: actions/checkout@v4
-    - name: Set up Python ${{ matrix.python-version }}
+    - name: Set up Python ${{ matrix.python-version.python }}
       uses: actions/setup-python@v5
       with:
-        python-version:  ${{ matrix.python-version }}
+        python-version:  ${{ matrix.python-version.python }}
     - name: Add conda to system path
       run: |
         # $CONDA is an environment variable pointing to the root of the miniconda directory
         echo $CONDA/bin >> $GITHUB_PATH
         conda --version
+    - name: Change conda version if necessary
+      if: ${{ matrix.python-version.conda != 'latest' }}
+      run: |
+        conda install conda=${{ matrix.python-version.conda }} python=${{ matrix.python-version.python }}
+        conda --version
     - name: Install dependencies
       run: |
         # replace python version in core dependencies
-        sed -i 's/python/python=${{ matrix.python-version }}/' dependencies_core.yml
+        sed -i 's/python/python=${{ matrix.python-version.python }}/' dependencies_core.yml
         conda env update --file dependencies_core.yml --name base
         conda list	
     - name: Prepare ptypy

diff --git a/README.rst b/README.rst
@@ -21,8 +21,8 @@ PtyPy - Ptychography Reconstruction for Python
 
 |ptypysite|
 
-.. image:: https://github.com/ptycho/ptypy/workflows/ptypy%20tests/badge.svg?branch=master
-    :target: https://github.com/ptycho/ptypy/actions
+.. image:: https://github.com/ptycho/ptypy/actions/workflows/test.yml/badge.svg?branch=master
+    :target: https://github.com/ptycho/ptypy/actions/workflows/test.yml
 
 Welcome Ptychonaut!
 -------------------
@@ -62,7 +62,8 @@ Features
 
     $ mpiexec -n [nodes] python <your_ptypy_script>.py
 
-* **GPU acceleration** based on custom kernels, pycuda, and reikna.
+* **GPU acceleration** based on custom kernels, CuPy or PyCUDA/reikna.
+  See examples in ``templates/accelerate``, ``templates/engines/cupy`` and ``templates/engines/pycuda``.
 
 * A **client-server** approach for visualization and control based on 
   `ZeroMQ <http://www.zeromq.org>`_ .
@@ -100,6 +101,17 @@ Ptypy depends on standard python packages:
  * mpi4py (optional - required for parallel computing)
  * pyzmq (optional - required for the plotting client)
 
+GPU support
+-----------
+
+We support an accelerated version of |ptypy| for CUDA-capable GPUs based on our own kernels and the
+`CuPy <https://cupy.dev/>`_ package. We recommend installing the dependencies for this version like so.
+::
+
+    $ conda env create -f ptypy/accelerate/cuda_cupy/dependencies.yml
+    $ conda activate ptypy_cupy
+    (ptypy_cupy)$ pip install .
+
 
 Quicklinks
 ----------

diff --git a/cufft/dependencies.yml b/cufft/dependencies.yml
@@ -1,10 +1,14 @@
 name: ptypy_cufft
 channels:
   - conda-forge
+  - nvidia
 dependencies:
   - python
   - cmake>=3.8.0
   - pybind11
   - compilers
-  - cudatoolkit-dev
+  - cuda-nvcc
+  - cuda-cudart-dev
+  - libcufft-dev
+  - libcufft-static
   - pip
diff --git a/cufft/extensions.py b/cufft/extensions.py
@@ -40,6 +40,11 @@ def locate_cuda():
     cudaconfig = {'home': home, 'nvcc': nvcc,
                   'include': os.path.join(home, 'include'),
                   'lib64': os.path.join(home, 'lib64')}
+
+    # If lib64 does not exist, try lib instead (as common in conda env)
+    if not os.path.exists(cudaconfig['lib64']):
+        cudaconfig['lib64'] = os.path.join(home, 'lib')
+
     for k, v in cudaconfig.items():
         if not os.path.exists(v):
             raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v))

diff --git a/cufft/setup.py b/cufft/setup.py
@@ -24,12 +24,19 @@
     )
     cmdclass = {"build_ext": CustomBuildExt}
     EXTBUILD_MESSAGE = "The filtered cufft extension has been successfully installed.\n"
-except:
+except EnvironmentError as e:
     EXTBUILD_MESSAGE = '*' * 75 + "\n"
     EXTBUILD_MESSAGE += "Could not install the filtered cufft extension.\n"
-    EXTBUILD_MESSAGE += "Make sure to have CUDA >= 10 and pybind11 installed.\n"
+    EXTBUILD_MESSAGE += "Make sure to have CUDA >= 10 installed.\n"
     EXTBUILD_MESSAGE += '*' * 75 + "\n"
-
+    EXTBUILD_MESSAGE += 'Error message: ' + str(e)
+except ImportError as e:
+    EXTBUILD_MESSAGE = '*' * 75 + "\n"
+    EXTBUILD_MESSAGE += "Could not install the filtered cufft extension.\n"
+    EXTBUILD_MESSAGE += "Make sure to have pybind11 installed.\n"
+    EXTBUILD_MESSAGE += '*' * 75 + "\n"
+    EXTBUILD_MESSAGE += 'Error message: ' + str(e)
+
 exclude_packages = []
 package_list = setuptools.find_packages(exclude=exclude_packages)
 setup(

diff --git a/doc/html_templates/ptypysphinx/download.html b/doc/html_templates/ptypysphinx/download.html
@@ -20,7 +20,8 @@
 <p>
 If you are having trouble getting ptypy up und running please let us know (
 <a href="https://github.com/pierrethibault">Pierre</a> or
-<a href="https://github.com/bjoernenders">Bjoern</a>).
+<a href="https://github.com/bjoernenders">Bjoern</a> or 
+<a href="https://github.com/daurer">Benedikt</a>).
 </p><p> You are also encouraged to file any issue, bug or request at the 
 <a href="www.github.com/ptycho/ptypy/issues">issue tracker</a>.
 </p>

diff --git a/doc/index.rst b/doc/index.rst
@@ -35,7 +35,8 @@ Highlights
 
     $ mpiexec/mpirun -n [nodes] python <your_ptypy_script>.py
 
-* **GPU acceleration** based on custom kernels, pycuda, and reikna.
+* **GPU acceleration** based on custom kernels, CuPy or PyCUDA/reikna.
+  See examples in ``templates/accelerate``, ``templates/engines/cupy`` and ``templates/engines/pycuda``.
 
 * A **client-server** approach for visualization and control based on 
   `ZeroMQ <http://www.zeromq.org>`_ .

diff --git a/doc/rst_templates/getting_started.tmp b/doc/rst_templates/getting_started.tmp
@@ -86,11 +86,26 @@ Install the recommended version like so
     (ptypy_full)$ pip install .
 
 
-Recommended install for GPU support
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Recommended install for GPU support with CuPy
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 We support an accelerated version of |ptypy|_ for CUDA-capable
 GPUs based on our own kernels and the
+`CuPy <https://cupy.dev/>`_ package.
+
+Install the dependencies for this version like so.
+::
+
+    $ conda env create -f ptypy/accelerate/cuda_cupy/dependencies.yml
+    $ conda activate ptypy_cupy
+    (ptypy_cupy)$ pip install .
+
+
+Install for GPU support with PyCUDA
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Alternatively, we also support an accelerated version of |ptypy|_ for CUDA-capable
+GPUs based on our own kernels and the
 `PyCUDA <https://pypi.org/project/pycuda/>`_ package.
 
 Install the dependencies for this version like so.
@@ -101,17 +116,22 @@ Install the dependencies for this version like so.
     (ptypy_pycuda)$ pip install .
 
 
-While we use `Reikna <https://pypi.org/project/reikna/>`_ to
-provide a filtered FFT, i.e. a FFT that is fused with a pointwise
-matrix multiplication, you can optionally also install a version
-based on cufft and callbacks. Due to the nature of this extension it
-needs to be built for fixed array sizes externally.
+We use `Reikna <https://pypi.org/project/reikna/>`_ to
+provide a filtered FFT, i.e. an FFT that is fused with a pointwise matrix multiplication.
+
+Optional installation of filtered cufft
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For optimal performance with both CuPy and PyCUDA engines, you can optionally install a version 
+of the filtered FFT based on cufft and callbacks. Due to the nature of this extension it
+needs to be built for fixed array sizes externally and currently supports array 
+sizes of 16, 32, 64, 128, 256, 512, 1024 and 2048.
 ::
 
-    $ conda activate ptypy_pycuda
-    (ptypy_pycuda)$ cd cufft
-    (ptypy_pycuda)$ conda env update --file dependencies.yml --name ptypy_pycuda
-    (ptypy_pycuda)$ pip install .
+    $ conda activate ptypy_cupy
+    (ptypy_cupy)$ cd cufft
+    (ptypy_cupy)$ conda env update --file dependencies.yml --name ptypy_cupy
+    (ptypy_cupy)$ pip install .
 
 
 Optional packages

diff --git a/ptypy/accelerate/base/array_utils.py b/ptypy/accelerate/base/array_utils.py
@@ -56,6 +56,16 @@ def norm2(input):
     return np.sum(abs2(input))
 
 
+def gaussian_kernel_2d(shape, sigmau, sigmav):
+    """
+    Return exp(-2*pi**2 * (sigmau**2 * u**2 + sigmav**2 * v**2)), where u and v are the
+    np.fft.fftfreq frequencies for shape[-2] and shape[-1]; sigmau / sigmav scale axis -2 / -1.
+    """
+    u, v = np.fft.fftfreq(shape[-2]), np.fft.fftfreq(shape[-1])  # one frequency vector per axis
+    uu, vv = np.meshgrid(u, v, sparse=True, indexing='ij')  # sparse 'ij' grids, broadcast below
+    kernel = np.exp(-2* ( (np.pi*sigmau)**2 * uu**2 + (np.pi*sigmav)**2 * vv**2 ) )
+    return kernel
+
 def complex_gaussian_filter(input, mfs):
     '''
     takes 2D and 3D arrays. Complex input, complex output. mfs has len 0<x<=2
@@ -70,6 +80,44 @@ def complex_gaussian_filter(input, mfs):
         input.dtype)
 
 
+def complex_gaussian_filter_fft(input, mfs):
+    '''
+    FFT-based Gaussian filter; takes 2D and 3D arrays, complex in/out. mfs has len 0<x<=2 (a single width is used for both axes)
+    '''
+    if len(mfs) > 2:
+        raise NotImplementedError("Only batches of 2D arrays allowed!")
+    elif len(mfs) == 1:
+        mfs = np.array([mfs,mfs])  # reuse the single entry for both kernel axes
+    else:
+        mfs = np.array(mfs)
+
+    k = gaussian_kernel_2d(input.shape, mfs[0], mfs[1]).astype(input.dtype)  # kernel from the last two dims, cast to input dtype
+    return fft_filter(input, k)
+
+
+def fft_filter(input, kernel, prefactor=None, postfactor=None, forward=True):
+    """
+    Compute output = ifft(fft( prefactor * input ) * kernel) * postfactor on a copy of input
+    (fft and ifft are swapped if forward is False; prefactor / postfactor are skipped if None).
+    """
+    # Make a copy (and cast if necessary)
+    x = np.array(input)
+
+    # NOTE(review): fftn/ifftn below get no `axes`, so all axes of x are transformed - confirm for 3D input
+    if prefactor is not None:
+        x *= prefactor  # in-place multiply on the copy
+
+    if forward:
+        x = np.fft.ifftn(np.fft.fftn(x, norm="ortho") * kernel, norm="ortho")
+    else:
+        x = np.fft.fftn(np.fft.ifftn(x, norm="ortho") * kernel, norm="ortho")
+
+    if postfactor is not None:
+        x *= postfactor  # in-place multiply on the filtered result
+
+    return x
+
+
 def mass_center(A):
     '''
     Input will always be real, and 2d or 3d, single precision here
@@ -81,7 +129,7 @@ def interpolated_shift(c, shift, do_linear=False):
     '''
     complex bicubic interpolated shift.
     complex output. This shift should be applied to 2D arrays. shift should have len=c.ndims 
-    
+
     '''
     if not do_linear:
         return ndi.shift(np.real(c), shift, order=3, prefilter=True) + 1j * ndi.shift(

diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py
@@ -23,13 +23,23 @@
 from ptypy.engines import register
 from ptypy.accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel
 from ptypy.accelerate.base import address_manglers
+from ptypy.accelerate.base.array_utils import complex_gaussian_filter, complex_gaussian_filter_fft
 
 
 __all__ = ['ML_serial']
 
 @register()
 class ML_serial(ML):
 
+    """
+    Defaults:
+
+    [smooth_gradient_method]
+    default = convolution
+    type = str
+    help = Method to be used for smoothing the gradient, choose between ```convolution``` or ```fft```.
+    """
+
     def __init__(self, ptycho_parent, pars=None):
         """
         Maximum likelihood reconstruction engine.
@@ -143,7 +153,12 @@ def engine_prepare(self):
         self.ML_model.prepare()
 
     def _get_smooth_gradient(self, data, sigma):
-        return self.smooth_gradient(data)
+        if self.p.smooth_gradient_method == "convolution":
+            return complex_gaussian_filter(data, sigma)
+        elif self.p.smooth_gradient_method == "fft":
+            return complex_gaussian_filter_fft(data, sigma)
+        else:
+            raise NotImplementedError("smooth_gradient_method should be ```convolution``` or ```fft```.")
 
     def _replace_ob_grad(self):
         new_ob_grad = self.ob_grad_new
@@ -272,7 +287,7 @@ def engine_iterate(self, num=1):
         return error_dct  # np.array([[self.ML_model.LL[0]] * 3])
 
     def position_update(self):
-        """ 
+        """
         Position refinement
         """
         if not self.do_position_refinement:
@@ -283,7 +298,7 @@ def position_update(self):
         # Update positions
         if do_update_pos:
             """
-            Iterates through all positions and refines them by a given algorithm. 
+            Iterates through all positions and refines them by a given algorithm.
             """
             log(4, "----------- START POS REF -------------")
             for dID in self.di.S.keys():
@@ -308,7 +323,7 @@ def position_update(self):
                 max_oby = ob.shape[-2] - aux.shape[-2] - 1
                 max_obx = ob.shape[-1] - aux.shape[-1] - 1
 
-                # We need to re-calculate the current error 
+                # We need to re-calculate the current error
                 PCK.build_aux(aux, addr, ob, pr)
                 aux[:] = FW(aux)
                 PCK.log_likelihood_ml(aux, addr, I, w, err_phot)
@@ -338,7 +353,7 @@ def engine_finalize(self):
                 for i,view in enumerate(d.views):
                     for j,(pname, pod) in enumerate(view.pods.items()):
                         delta = (prep.addr[i][j][1][1:] - prep.original_addr[i][j][1][1:]) * res
-                        pod.ob_view.coord += delta 
+                        pod.ob_view.coord += delta
                         pod.ob_view.storage.update_views(pod.ob_view)
             self.ptycho.record_positions = True
 

diff --git a/ptypy/accelerate/cuda_common/batched_multiply.cu b/ptypy/accelerate/cuda_common/batched_multiply.cu
@@ -22,10 +22,11 @@ extern "C" __global__ void batched_multiply(const complex<IN_TYPE>* input,
   int gy = threadIdx.y + blockIdx.y * blockDim.y;
   int gz = threadIdx.z + blockIdx.z * blockDim.z;
 
-  if (gx > columns || gy > rows || gz > nBatches)
+  if (gx > rows - 1 || gy > columns - 1 || gz > nBatches)
     return;
 
   auto val = input[gz * rows * columns + gy * rows + gx];
+
   if (MPY_DO_FILT)  // set at compile-time
   {
     val *= filter[gy * rows + gx];