From ad79d1a132ffa39d7ae324af65cbef0722ed8fb8 Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Tue, 5 Mar 2024 09:19:04 +0000 Subject: [PATCH 01/14] allow reading pos from single entry (#537) --- ptypy/experiment/hdf5_loader.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/ptypy/experiment/hdf5_loader.py b/ptypy/experiment/hdf5_loader.py index ac54caa8..72846e73 100644 --- a/ptypy/experiment/hdf5_loader.py +++ b/ptypy/experiment/hdf5_loader.py @@ -81,6 +81,11 @@ class Hdf5Loader(PtyScan): type = float help = Multiplicative factor that converts motor positions to metres. + [positions.slow_index] + default = None + type = int + help = Index along the final dimension + [positions.fast_key] default = None type = str @@ -91,6 +96,11 @@ class Hdf5Loader(PtyScan): type = float help = Multiplicative factor that converts motor positions to metres. + [positions.fast_index] + default = None + type = int + help = Index along the final dimension + [positions.bounding_box] default = type = Param @@ -425,12 +435,16 @@ def _prepare_intensity_and_positions(self): self.fast_axis = self.fhandle_positions_fast[self.p.positions.fast_key] if self._is_spectro_scan and self.p.outer_index is not None: self.fast_axis = self.fast_axis[self.p.outer_index] + if self.p.positions.fast_index is not None: + self.fast_axis = self.fast_axis[:,self.p.positions.fast_index] self.positions_fast_shape = np.squeeze(self.fast_axis).shape if self.fast_axis.ndim > 2 else self.fast_axis.shape self.fhandle_positions_slow = h5.File(self.p.positions.file, 'r', swmr=self._is_swmr) self.slow_axis = self.fhandle_positions_slow[self.p.positions.slow_key] if self._is_spectro_scan and self.p.outer_index is not None: self.slow_axis = self.slow_axis[self.p.outer_index] + if self.p.positions.slow_index is not None: + self.slow_axis = self.slow_axis[:,self.p.positions.slow_index] self.positions_slow_shape = np.squeeze(self.slow_axis).shape if self.slow_axis.ndim > 2 else self.slow_axis.shape log(3, "The shape of the \n\tdiffraction intensities is: {}\n\tslow axis data:{}\n\tfast axis data:{}".format(self.data_shape, From cf77c69947063d9693fde023edcae041c9cd53c1 Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Tue, 5 Mar 2024 09:19:23 +0000 Subject: [PATCH 02/14] make a copy of runtime in restore (#536) --- ptypy/core/ptycho.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ptypy/core/ptycho.py b/ptypy/core/ptycho.py index 18fd22c7..61dc1ccd 100644 --- a/ptypy/core/ptycho.py +++ b/ptypy/core/ptycho.py @@ -1151,7 +1151,7 @@ def restore_state(self, name="baseline", reformat_exit=True): S.data[:] = self.state_dict[name]["ob"].storages[ID].data for ID,S in self.exit.storages.items(): S.data[:] = self.state_dict[name]["ex"].storages[ID].data - self.runtime = self.state_dict[name]["runtime"] + self.runtime = self.state_dict[name]["runtime"].copy(depth=99) # Reformat/Recalculate exit waves if reformat_exit: From eb76faa07dc62cd595d194f8f45d1563cec0044f Mon Sep 17 00:00:00 2001 From: "Benedikt J. 
Daurer" Date: Tue, 5 Mar 2024 09:19:44 +0000 Subject: [PATCH 03/14] Fix pycuda templates to only load pycuda engines (#538) * load only pycuda engines in pycuda templates * load only pycuda engines in more templates --- .../ptypy_i13_AuStar_farfield_pycuda.py | 2 +- .../ptypy_i13_AuStar_nearfield_pycuda.py | 2 +- .../ptypy_id22ni_AuStar_focused_pycuda.py | 2 +- .../ptypy_laser_logo_focused_pycuda.py | 2 +- .../ptypy_minimal_prep_and_run_pycuda.py | 2 +- .../engines/pycuda/moonflower_DM_ML_pycuda.py | 2 +- .../engines/pycuda/moonflower_DM_pycuda.py | 2 +- .../pycuda/moonflower_DM_pycuda_nostream.py | 2 +- .../pycuda/moonflower_EPIE_ML_pycuda.py | 2 +- .../engines/pycuda/moonflower_EPIE_pycuda.py | 2 +- .../engines/pycuda/moonflower_ML_ML_pycuda.py | 2 +- .../engines/pycuda/moonflower_ML_pycuda.py | 2 +- .../pycuda/moonflower_RAAR_ML_pycuda.py | 2 +- .../engines/pycuda/moonflower_RAAR_pycuda.py | 2 +- .../engines/pycuda/moonflower_SDR_pycuda.py | 2 +- .../moonflower_DM_delayed_pycuda.py | 2 +- .../notebooks/moonflower_dm_pycuda.ipynb | 62 ++++--------------- .../notebooks/moonflower_epie_pycuda.ipynb | 2 +- .../notebooks/moonflower_ml_pycuda.ipynb | 62 ++++--------------- .../notebooks/moonflower_raar_pycuda.ipynb | 2 +- .../moonflower_posref_DM_pycuda.py | 2 +- .../moonflower_posref_EPIE_pycuda.py | 2 +- .../moonflower_posref_ML_pycuda.py | 2 +- .../moonflower_posref_SDR_pycuda.py | 2 +- 24 files changed, 44 insertions(+), 124 deletions(-) diff --git a/templates/accelerate/ptypy_i13_AuStar_farfield_pycuda.py b/templates/accelerate/ptypy_i13_AuStar_farfield_pycuda.py index 865f5d9c..635cd7a6 100644 --- a/templates/accelerate/ptypy_i13_AuStar_farfield_pycuda.py +++ b/templates/accelerate/ptypy_i13_AuStar_farfield_pycuda.py @@ -6,7 +6,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/accelerate/ptypy_i13_AuStar_nearfield_pycuda.py b/templates/accelerate/ptypy_i13_AuStar_nearfield_pycuda.py index 671116fa..6a63853f 100644 --- a/templates/accelerate/ptypy_i13_AuStar_nearfield_pycuda.py +++ b/templates/accelerate/ptypy_i13_AuStar_nearfield_pycuda.py @@ -6,7 +6,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/accelerate/ptypy_id22ni_AuStar_focused_pycuda.py b/templates/accelerate/ptypy_id22ni_AuStar_focused_pycuda.py index fb9c3920..1d3a39b0 100644 --- a/templates/accelerate/ptypy_id22ni_AuStar_focused_pycuda.py +++ b/templates/accelerate/ptypy_id22ni_AuStar_focused_pycuda.py @@ -6,7 +6,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import numpy as np import tempfile diff --git a/templates/accelerate/ptypy_laser_logo_focused_pycuda.py b/templates/accelerate/ptypy_laser_logo_focused_pycuda.py index 9628bd97..3d0b44b7 100644 --- a/templates/accelerate/ptypy_laser_logo_focused_pycuda.py +++ b/templates/accelerate/ptypy_laser_logo_focused_pycuda.py @@ -7,7 +7,7 @@ from ptypy import utils as u import ptypy.simulations as sim import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import pathlib import numpy as np diff --git a/templates/accelerate/ptypy_minimal_prep_and_run_pycuda.py 
b/templates/accelerate/ptypy_minimal_prep_and_run_pycuda.py index d28643f5..347768e5 100644 --- a/templates/accelerate/ptypy_minimal_prep_and_run_pycuda.py +++ b/templates/accelerate/ptypy_minimal_prep_and_run_pycuda.py @@ -6,7 +6,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/engines/pycuda/moonflower_DM_ML_pycuda.py b/templates/engines/pycuda/moonflower_DM_ML_pycuda.py index ae70d903..84050fdb 100644 --- a/templates/engines/pycuda/moonflower_DM_ML_pycuda.py +++ b/templates/engines/pycuda/moonflower_DM_ML_pycuda.py @@ -6,7 +6,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/engines/pycuda/moonflower_DM_pycuda.py b/templates/engines/pycuda/moonflower_DM_pycuda.py index b4842bc3..0ba01ff3 100644 --- a/templates/engines/pycuda/moonflower_DM_pycuda.py +++ b/templates/engines/pycuda/moonflower_DM_pycuda.py @@ -6,7 +6,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/engines/pycuda/moonflower_DM_pycuda_nostream.py b/templates/engines/pycuda/moonflower_DM_pycuda_nostream.py index 3c2a702b..3fe26627 100644 --- a/templates/engines/pycuda/moonflower_DM_pycuda_nostream.py +++ b/templates/engines/pycuda/moonflower_DM_pycuda_nostream.py @@ -6,7 +6,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/engines/pycuda/moonflower_EPIE_ML_pycuda.py b/templates/engines/pycuda/moonflower_EPIE_ML_pycuda.py index a0b1e8c9..2269a161 100644 --- a/templates/engines/pycuda/moonflower_EPIE_ML_pycuda.py +++ b/templates/engines/pycuda/moonflower_EPIE_ML_pycuda.py @@ -6,7 +6,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/engines/pycuda/moonflower_EPIE_pycuda.py b/templates/engines/pycuda/moonflower_EPIE_pycuda.py index e96ee91d..aaf78701 100644 --- a/templates/engines/pycuda/moonflower_EPIE_pycuda.py +++ b/templates/engines/pycuda/moonflower_EPIE_pycuda.py @@ -6,7 +6,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/engines/pycuda/moonflower_ML_ML_pycuda.py b/templates/engines/pycuda/moonflower_ML_ML_pycuda.py index d506ec5a..87b7a6dd 100644 --- a/templates/engines/pycuda/moonflower_ML_ML_pycuda.py +++ b/templates/engines/pycuda/moonflower_ML_ML_pycuda.py @@ -7,7 +7,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/engines/pycuda/moonflower_ML_pycuda.py b/templates/engines/pycuda/moonflower_ML_pycuda.py index 4e3ce15f..2938275f 100644 --- a/templates/engines/pycuda/moonflower_ML_pycuda.py +++ 
b/templates/engines/pycuda/moonflower_ML_pycuda.py @@ -7,7 +7,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/engines/pycuda/moonflower_RAAR_ML_pycuda.py b/templates/engines/pycuda/moonflower_RAAR_ML_pycuda.py index 28e565f8..39634dd1 100644 --- a/templates/engines/pycuda/moonflower_RAAR_ML_pycuda.py +++ b/templates/engines/pycuda/moonflower_RAAR_ML_pycuda.py @@ -6,7 +6,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/engines/pycuda/moonflower_RAAR_pycuda.py b/templates/engines/pycuda/moonflower_RAAR_pycuda.py index 52b9c539..0575f23b 100644 --- a/templates/engines/pycuda/moonflower_RAAR_pycuda.py +++ b/templates/engines/pycuda/moonflower_RAAR_pycuda.py @@ -6,7 +6,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/engines/pycuda/moonflower_SDR_pycuda.py b/templates/engines/pycuda/moonflower_SDR_pycuda.py index d76c7689..68390804 100644 --- a/templates/engines/pycuda/moonflower_SDR_pycuda.py +++ b/templates/engines/pycuda/moonflower_SDR_pycuda.py @@ -6,7 +6,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/live_processing/moonflower_DM_delayed_pycuda.py b/templates/live_processing/moonflower_DM_delayed_pycuda.py index ab2bcc41..29e9ac8f 100644 --- a/templates/live_processing/moonflower_DM_delayed_pycuda.py +++ b/templates/live_processing/moonflower_DM_delayed_pycuda.py @@ -7,7 +7,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/notebooks/moonflower_dm_pycuda.ipynb b/templates/notebooks/moonflower_dm_pycuda.ipynb index 42937721..8143ef4c 100644 --- a/templates/notebooks/moonflower_dm_pycuda.ipynb +++ b/templates/notebooks/moonflower_dm_pycuda.ipynb @@ -12,11 +12,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "import ptypy\n", @@ -27,25 +23,17 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "# Import pycuda engines (DM, RAAR)\n", - "ptypy.load_gpu_engines(\"cuda\")" + "ptypy.load_gpu_engines(\"pycuda\")" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "# create parameter tree\n", @@ -55,11 +43,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "# set verbose level to interactive\n", @@ -69,11 +53,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "# Nr. 
of frames in a block\n", @@ -83,11 +63,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "# set home path and io settings (no files saved)\n", @@ -101,11 +77,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "# max 200 frames (128x128px) of diffraction data\n", @@ -125,11 +97,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "# difference map reconstrucion engine\n", @@ -144,11 +112,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "# prepare and run\n", @@ -165,11 +129,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "import ptypy.utils.plot_client as pc\n", diff --git a/templates/notebooks/moonflower_epie_pycuda.ipynb b/templates/notebooks/moonflower_epie_pycuda.ipynb index 290d424e..66bc396f 100644 --- a/templates/notebooks/moonflower_epie_pycuda.ipynb +++ b/templates/notebooks/moonflower_epie_pycuda.ipynb @@ -27,7 +27,7 @@ "outputs": [], "source": [ "# Import pycuda engines (DM, RAAR)\n", - "ptypy.load_gpu_engines(\"cuda\")" + "ptypy.load_gpu_engines(\"pycuda\")" ] }, { diff --git a/templates/notebooks/moonflower_ml_pycuda.ipynb b/templates/notebooks/moonflower_ml_pycuda.ipynb index e5e9b3d5..f266f8c8 100644 --- a/templates/notebooks/moonflower_ml_pycuda.ipynb +++ b/templates/notebooks/moonflower_ml_pycuda.ipynb @@ -12,11 +12,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "import ptypy\n", @@ -27,25 +23,17 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "# Import pycuda engines (DM, RAAR)\n", - "ptypy.load_gpu_engines(\"cuda\")" + "ptypy.load_gpu_engines(\"pycuda\")" ] }, { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "# create parameter tree\n", @@ -55,11 +43,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "# set verbose level to interactive\n", @@ -69,11 +53,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "# Nr. 
of frames in a block\n", @@ -83,11 +63,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "# set home path and io settings (no files saved)\n", @@ -101,11 +77,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "# max 200 frames (128x128px) of diffraction data\n", @@ -125,11 +97,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "# maximum likelihood reconstrucion engine\n", @@ -149,11 +117,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "# prepare and run\n", @@ -170,11 +134,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "vscode": { - "languageId": "python" - } - }, + "metadata": {}, "outputs": [], "source": [ "import ptypy.utils.plot_client as pc\n", diff --git a/templates/notebooks/moonflower_raar_pycuda.ipynb b/templates/notebooks/moonflower_raar_pycuda.ipynb index 879428e8..e8bb338c 100644 --- a/templates/notebooks/moonflower_raar_pycuda.ipynb +++ b/templates/notebooks/moonflower_raar_pycuda.ipynb @@ -27,7 +27,7 @@ "outputs": [], "source": [ "# Import pycuda engines (DM, RAAR)\n", - "ptypy.load_gpu_engines(\"cuda\")" + "ptypy.load_gpu_engines(\"pycuda\")" ] }, { diff --git a/templates/position_refinement/moonflower_posref_DM_pycuda.py b/templates/position_refinement/moonflower_posref_DM_pycuda.py index 50ec2da3..dc2b288b 100644 --- a/templates/position_refinement/moonflower_posref_DM_pycuda.py +++ b/templates/position_refinement/moonflower_posref_DM_pycuda.py @@ -7,7 +7,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/position_refinement/moonflower_posref_EPIE_pycuda.py b/templates/position_refinement/moonflower_posref_EPIE_pycuda.py index 39cc3b43..d3d61919 100644 --- a/templates/position_refinement/moonflower_posref_EPIE_pycuda.py +++ b/templates/position_refinement/moonflower_posref_EPIE_pycuda.py @@ -7,7 +7,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/position_refinement/moonflower_posref_ML_pycuda.py b/templates/position_refinement/moonflower_posref_ML_pycuda.py index b7bc7b75..0b0cf2cd 100644 --- a/templates/position_refinement/moonflower_posref_ML_pycuda.py +++ b/templates/position_refinement/moonflower_posref_ML_pycuda.py @@ -7,7 +7,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") +ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() diff --git a/templates/position_refinement/moonflower_posref_SDR_pycuda.py b/templates/position_refinement/moonflower_posref_SDR_pycuda.py index e13cd58d..880b2c54 100644 --- a/templates/position_refinement/moonflower_posref_SDR_pycuda.py +++ b/templates/position_refinement/moonflower_posref_SDR_pycuda.py @@ -7,7 +7,7 @@ from ptypy.core import Ptycho from ptypy import utils as u import ptypy -ptypy.load_gpu_engines(arch="cuda") 
+ptypy.load_gpu_engines(arch="pycuda") import tempfile tmpdir = tempfile.gettempdir() From ebf8deb53f9b1915ff681c3004067518e7e12a9e Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Tue, 5 Mar 2024 09:20:07 +0000 Subject: [PATCH 04/14] update dependencies for cupy/pycuda (#539) --- ptypy/accelerate/cuda_cupy/dependencies.yml | 6 +++--- ptypy/accelerate/cuda_pycuda/dependencies.yml | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ptypy/accelerate/cuda_cupy/dependencies.yml b/ptypy/accelerate/cuda_cupy/dependencies.yml index 6331bbbc..9b803c65 100644 --- a/ptypy/accelerate/cuda_cupy/dependencies.yml +++ b/ptypy/accelerate/cuda_cupy/dependencies.yml @@ -11,7 +11,7 @@ dependencies: - mpi4py - pillow - pyfftw - - cupy - - cudatoolkit-dev - pip - - compilers \ No newline at end of file + - compilers + - cupy + \ No newline at end of file diff --git a/ptypy/accelerate/cuda_pycuda/dependencies.yml b/ptypy/accelerate/cuda_pycuda/dependencies.yml index 455d6047..181634f1 100644 --- a/ptypy/accelerate/cuda_pycuda/dependencies.yml +++ b/ptypy/accelerate/cuda_pycuda/dependencies.yml @@ -1,6 +1,7 @@ name: ptypy_pycuda channels: - conda-forge + - nvidia dependencies: - python - numpy @@ -14,8 +15,7 @@ dependencies: - pyyaml - reikna - pycuda - - cudatoolkit-dev + - cuda-nvcc + - cuda-cudart-dev - pip - compilers - - pip: - - scikit-cuda \ No newline at end of file From c9d7bacf70acbccd4321127ff76a19475797e9df Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Tue, 5 Mar 2024 09:20:28 +0000 Subject: [PATCH 05/14] update dependencies and better error handling (#540) --- cufft/dependencies.yml | 6 +++++- cufft/extensions.py | 5 +++++ cufft/setup.py | 13 ++++++++++--- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/cufft/dependencies.yml b/cufft/dependencies.yml index 48f17a1e..2b081a06 100644 --- a/cufft/dependencies.yml +++ b/cufft/dependencies.yml @@ -1,10 +1,14 @@ name: ptypy_cufft channels: - conda-forge + - nvidia dependencies: - python - cmake>=3.8.0 - pybind11 - compilers - - cudatoolkit-dev + - cuda-nvcc + - cuda-cudart-dev + - libcufft-dev + - libcufft-static - pip \ No newline at end of file diff --git a/cufft/extensions.py b/cufft/extensions.py index 545b43d0..4d8b7f59 100644 --- a/cufft/extensions.py +++ b/cufft/extensions.py @@ -40,6 +40,11 @@ def locate_cuda(): cudaconfig = {'home': home, 'nvcc': nvcc, 'include': os.path.join(home, 'include'), 'lib64': os.path.join(home, 'lib64')} + + # If lib64 does not exist, try lib instead (as common in conda env) + if not os.path.exists(cudaconfig['lib64']): + cudaconfig['lib64'] = os.path.join(home, 'lib') + for k, v in cudaconfig.items(): if not os.path.exists(v): raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v)) diff --git a/cufft/setup.py b/cufft/setup.py index 5108ebf3..a8ef7c61 100644 --- a/cufft/setup.py +++ b/cufft/setup.py @@ -24,12 +24,19 @@ ) cmdclass = {"build_ext": CustomBuildExt} EXTBUILD_MESSAGE = "The filtered cufft extension has been successfully installed.\n" -except: +except EnvironmentError as e: EXTBUILD_MESSAGE = '*' * 75 + "\n" EXTBUILD_MESSAGE += "Could not install the filtered cufft extension.\n" - EXTBUILD_MESSAGE += "Make sure to have CUDA >= 10 and pybind11 installed.\n" + EXTBUILD_MESSAGE += "Make sure to have CUDA >= 10 installed.\n" EXTBUILD_MESSAGE += '*' * 75 + "\n" - + EXTBUILD_MESSAGE += 'Error message: ' + str(e) +except ImportError as e: + EXTBUILD_MESSAGE = '*' * 75 + "\n" + EXTBUILD_MESSAGE += "Could not install the filtered cufft 
extension.\n" + EXTBUILD_MESSAGE += "Make sure to have pybind11 installed.\n" + EXTBUILD_MESSAGE += '*' * 75 + "\n" + EXTBUILD_MESSAGE += 'Error message: ' + str(e) + exclude_packages = [] package_list = setuptools.find_packages(exclude=exclude_packages) setup( From 4b1a7b2b7ff20d3b63e2fd26014273472a31dab5 Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Tue, 5 Mar 2024 09:20:55 +0000 Subject: [PATCH 06/14] Fixed link in actions badge (#541) --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index edf712d7..bafea120 100644 --- a/README.rst +++ b/README.rst @@ -21,8 +21,8 @@ PtyPy - Ptychography Reconstruction for Python |ptypysite| -.. image:: https://github.com/ptycho/ptypy/workflows/ptypy%20tests/badge.svg?branch=master - :target: https://github.com/ptycho/ptypy/actions +.. image:: https://github.com/ptycho/ptypy/actions/workflows/test.yml/badge.svg?branch=master + :target: https://github.com/ptycho/ptypy/actions/workflows/test.yml Welcome Ptychonaut! ------------------- From 9155c5df179faaca0ab79303ec39e009b84c9d16 Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Tue, 5 Mar 2024 14:01:00 +0000 Subject: [PATCH 07/14] gather_dict on local error is big bottleneck for large datasets (#527) * testing idea of skipping gather_dict * apply error allreduce to other engines * added new reduced error logic to accelerated stochastic engines --- .../cuda_cupy/engines/projectional_cupy.py | 22 +++++++++++++--- .../engines/projectional_cupy_stream.py | 26 ++++++++++++------- .../cuda_cupy/engines/stochastic.py | 24 +++++++++++++---- .../engines/projectional_pycuda.py | 22 +++++++++++++--- .../engines/projectional_pycuda_stream.py | 24 ++++++++++++----- .../cuda_pycuda/engines/stochastic.py | 21 ++++++++++++--- ptypy/engines/base.py | 16 ++++++++---- 7 files changed, 118 insertions(+), 37 deletions(-) diff --git a/ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py index 45eb4d01..ab56df5f 100644 --- a/ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py +++ b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy.py @@ -209,7 +209,11 @@ def engine_iterate(self, num=1): queue.use() for it in range(num): - error = {} + + reduced_error = np.zeros((3,)) + reduced_error_count = 0 + local_error = {} + for dID in self.di.S.keys(): # find probe, object and exit ID in dependence of dID @@ -294,9 +298,19 @@ def engine_iterate(self, num=1): err_phot = prep.err_phot_gpu.get() err_exit = prep.err_exit_gpu.get() errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T) - error.update(zip(prep.view_IDs, errs)) - - self.error = error + if self.p.record_local_error: + local_error.update(zip(prep.view_IDs, errs)) + else: + reduced_error += errs.sum(axis=0) + reduced_error_count += errs.shape[0] + + if self.p.record_local_error: + error = local_error + else: + # Gather errors across all MPI ranks + error = parallel.allreduce(reduced_error) + count = parallel.allreduce(reduced_error_count) + error /= count return error def position_update(self): diff --git a/ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py index b236874a..56d65d6d 100644 --- a/ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py +++ b/ptypy/accelerate/cuda_cupy/engines/projectional_cupy_stream.py @@ -158,7 +158,9 @@ def engine_iterate(self, num=1): for it in range(num): - error = {} + reduced_error = 
np.zeros((3,)) + reduced_error_count = 0 + local_error = {} for inner in range(self.p.overlap_max_iterations): @@ -403,20 +405,26 @@ def engine_iterate(self, num=1): cp.asnumpy(s.gpu, stream=self.queue, out=s.data) for name, s in self.pr.S.items(): cp.asnumpy(s.gpu, stream=self.queue, out=s.data) - - self.queue.synchronize() - # costly but needed to sync back with - # for name, s in self.ex.S.items(): - # s.data[:] = s.gpu.get() + # Gather errors from device for dID, prep in self.diff_info.items(): err_fourier = prep.err_fourier_gpu.get() err_phot = prep.err_phot_gpu.get() err_exit = prep.err_exit_gpu.get() errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T) - error.update(zip(prep.view_IDs, errs)) - - self.error = error + if self.p.record_local_error: + local_error.update(zip(prep.view_IDs, errs)) + else: + reduced_error += errs.sum(axis=0) + reduced_error_count += errs.shape[0] + + if self.p.record_local_error: + error = local_error + else: + # Gather errors across all MPI ranks + error = parallel.allreduce(reduced_error) + count = parallel.allreduce(reduced_error_count) + error /= count return error # probe update diff --git a/ptypy/accelerate/cuda_cupy/engines/stochastic.py b/ptypy/accelerate/cuda_cupy/engines/stochastic.py index f798d569..6348fa88 100644 --- a/ptypy/accelerate/cuda_cupy/engines/stochastic.py +++ b/ptypy/accelerate/cuda_cupy/engines/stochastic.py @@ -227,9 +227,13 @@ def engine_iterate(self, num=1): Compute one iteration. """ self.dID_list = list(self.di.S.keys()) - error = {} + for it in range(num): + reduced_error = np.zeros((3,)) + reduced_error_count = 0 + local_error = {} + for iblock, dID in enumerate(self.dID_list): # find probe, object and exit ID in dependence of dID @@ -378,14 +382,24 @@ def engine_iterate(self, num=1): err_fourier = prep.err_fourier_gpu.get() err_phot = prep.err_phot_gpu.get() err_exit = prep.err_exit_gpu.get() - errs = np.ascontiguousarray( - np.vstack([err_fourier, err_phot, err_exit]).T) - error.update(zip(prep.view_IDs, errs)) + errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T) + if self.p.record_local_error: + local_error.update(zip(prep.view_IDs, errs)) + else: + reduced_error += errs.sum(axis=0) + reduced_error_count += errs.shape[0] + + if self.p.record_local_error: + error = local_error + else: + # Gather errors across all MPI ranks + error = parallel.allreduce(reduced_error) + count = parallel.allreduce(reduced_error_count) + error /= count # wait for the async transfers self.qu_dtoh.synchronize() - self.error = error return error def position_update_local(self, prep, i): diff --git a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py index a10fceff..76c36212 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda.py @@ -208,7 +208,11 @@ def engine_iterate(self, num=1): queue = self.queue for it in range(num): - error = {} + + reduced_error = np.zeros((3,)) + reduced_error_count = 0 + local_error = {} + for dID in self.di.S.keys(): # find probe, object and exit ID in dependence of dID @@ -290,9 +294,19 @@ def engine_iterate(self, num=1): err_phot = prep.err_phot_gpu.get() err_exit = prep.err_exit_gpu.get() errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T) - error.update(zip(prep.view_IDs, errs)) - - self.error = error + if self.p.record_local_error: + local_error.update(zip(prep.view_IDs, errs)) + else: + 
reduced_error += errs.sum(axis=0) + reduced_error_count += errs.shape[0] + + if self.p.record_local_error: + error = local_error + else: + # Gather errors across all MPI ranks + error = parallel.allreduce(reduced_error) + count = parallel.allreduce(reduced_error_count) + error /= count return error def position_update(self): diff --git a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py index 6c54a807..9c7c764e 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py +++ b/ptypy/accelerate/cuda_pycuda/engines/projectional_pycuda_stream.py @@ -150,7 +150,9 @@ def engine_iterate(self, num=1): for it in range(num): - error = {} + reduced_error = np.zeros((3,)) + reduced_error_count = 0 + local_error = {} for inner in range(self.p.overlap_max_iterations): @@ -387,17 +389,25 @@ def engine_iterate(self, num=1): for name, s in self.pr.S.items(): s.data[:] = s.gpu.get() - # costly but needed to sync back with - # for name, s in self.ex.S.items(): - # s.data[:] = s.gpu.get() + # Gather errors for dID, prep in self.diff_info.items(): err_fourier = prep.err_fourier_gpu.get() err_phot = prep.err_phot_gpu.get() err_exit = prep.err_exit_gpu.get() errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T) - error.update(zip(prep.view_IDs, errs)) - - self.error = error + if self.p.record_local_error: + local_error.update(zip(prep.view_IDs, errs)) + else: + reduced_error += errs.sum(axis=0) + reduced_error_count += errs.shape[0] + + if self.p.record_local_error: + error = local_error + else: + # Gather errors across all MPI ranks + error = parallel.allreduce(reduced_error) + count = parallel.allreduce(reduced_error_count) + error /= count return error ## probe update diff --git a/ptypy/accelerate/cuda_pycuda/engines/stochastic.py b/ptypy/accelerate/cuda_pycuda/engines/stochastic.py index d45a6721..dec8de24 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/stochastic.py +++ b/ptypy/accelerate/cuda_pycuda/engines/stochastic.py @@ -222,9 +222,13 @@ def engine_iterate(self, num=1): Compute one iteration. 
""" self.dID_list = list(self.di.S.keys()) - error = {} + for it in range(num): + reduced_error = np.zeros((3,)) + reduced_error_count = 0 + local_error = {} + for iblock, dID in enumerate(self.dID_list): # find probe, object and exit ID in dependence of dID @@ -357,12 +361,23 @@ def engine_iterate(self, num=1): err_phot = prep.err_phot_gpu.get() err_exit = prep.err_exit_gpu.get() errs = np.ascontiguousarray(np.vstack([err_fourier, err_phot, err_exit]).T) - error.update(zip(prep.view_IDs, errs)) + if self.p.record_local_error: + local_error.update(zip(prep.view_IDs, errs)) + else: + reduced_error += errs.sum(axis=0) + reduced_error_count += errs.shape[0] + + if self.p.record_local_error: + error = local_error + else: + # Gather errors across all MPI ranks + error = parallel.allreduce(reduced_error) + count = parallel.allreduce(reduced_error_count) + error /= count # wait for the async transfers self.qu_dtoh.synchronize() - self.error = error return error def position_update_local(self, prep, i): diff --git a/ptypy/engines/base.py b/ptypy/engines/base.py index 67426c0b..369056ab 100644 --- a/ptypy/engines/base.py +++ b/ptypy/engines/base.py @@ -262,11 +262,17 @@ def iterate(self, num=None): parallel.barrier() def _fill_runtime(self): - local_error = u.parallel.gather_dict(self.error) - if local_error: - error = np.array(list(local_error.values())).mean(0) + local_error = None + if isinstance(self.error, np.ndarray) and (len(self.error)== 3): + error = self.error + elif isinstance(self.error, dict): + local_error = u.parallel.gather_dict(self.error) + if local_error: + error = np.array(list(local_error.values())).mean(0) + else: + error = np.zeros((3,)) else: - error = np.zeros((1,)) + logger.error("Reconstruction error should be dictionary or ndarray of shape (3,)") info = dict( iteration=self.curiter, iterations=self.alliter, @@ -277,7 +283,7 @@ def _fill_runtime(self): ) self.ptycho.runtime.iter_info.append(info) - if self.p.record_local_error: + if self.p.record_local_error and (local_error is not None): self.ptycho.runtime.error_local = local_error def finalize(self): From 0bdbacd745b9520e1208fd09be5669a82862e8ca Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Wed, 6 Mar 2024 16:09:24 +0000 Subject: [PATCH 08/14] GitHub actions: pin conda version to 23.11.0 (#544) * Fix conda to 24.1.1 because of bug in 24.1.2 * whitespace * Need to roll back to 23.11.0 * seems like conda 23.11.0 needs python 3.11 --- .github/workflows/test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 98be46be..c3d61e1a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -38,6 +38,11 @@ jobs: # $CONDA is an environment variable pointing to the root of the miniconda directory echo $CONDA/bin >> $GITHUB_PATH conda --version + - name: Change conda version + run: | + # There is a bug in the most recent version (24.1.2) + conda install conda=23.11.0 python=3.11 + conda --version - name: Install dependencies run: | # replace python version in core dependencies From f2761dc70711e31a26210070eb67d24dc154c3c3 Mon Sep 17 00:00:00 2001 From: "Benedikt J. 
Daurer" Date: Wed, 6 Mar 2024 16:14:27 +0000 Subject: [PATCH 09/14] Bugfix in batch_multiply kernel and more tests (#543) --- .../cuda_common/batched_multiply.cu | 2 +- .../cuda_cupy_tests/fft_scaling_test.py | 78 ++++++++++-- .../cuda_pycuda_tests/fft_scaling_test.py | 119 ++++++++++++++++-- 3 files changed, 175 insertions(+), 24 deletions(-) diff --git a/ptypy/accelerate/cuda_common/batched_multiply.cu b/ptypy/accelerate/cuda_common/batched_multiply.cu index f91bb6d3..11394f68 100644 --- a/ptypy/accelerate/cuda_common/batched_multiply.cu +++ b/ptypy/accelerate/cuda_common/batched_multiply.cu @@ -22,7 +22,7 @@ extern "C" __global__ void batched_multiply(const complex* input, int gy = threadIdx.y + blockIdx.y * blockDim.y; int gz = threadIdx.z + blockIdx.z * blockDim.z; - if (gx > columns || gy > rows || gz > nBatches) + if (gx > rows || gy > columns || gz > nBatches) return; auto val = input[gz * rows * columns + gy * rows + gx]; diff --git a/test/accelerate_tests/cuda_cupy_tests/fft_scaling_test.py b/test/accelerate_tests/cuda_cupy_tests/fft_scaling_test.py index 96cd1256..80cc2246 100644 --- a/test/accelerate_tests/cuda_cupy_tests/fft_scaling_test.py +++ b/test/accelerate_tests/cuda_cupy_tests/fft_scaling_test.py @@ -39,16 +39,19 @@ def get_reverse_cuFFT(f, stream, class FftScalingTest(CupyCudaTest): - def get_input(self, size): + def get_input(self, size, squared=True): rows = cols = size + if not squared: + cols += 2 batches = 1 f = np.ones(shape=(batches, rows, cols), dtype=COMPLEX_TYPE) return f #### Trivial foward transform tests #### - def fwd_test(self, symmetric, factory, preffact=None, postfact=None, external=True, size=32): - f = self.get_input(size) + def fwd_test(self, symmetric, factory, preffact=None, postfact=None, external=True, + size=32, squared=True, decimal=6): + f = self.get_input(size, squared=squared) f_d = cp.asarray(f) if preffact is not None: pref = preffact * np.ones(shape=f.shape[-2:], dtype=np.complex64) @@ -70,8 +73,8 @@ def fwd_test(self, symmetric, factory, preffact=None, postfact=None, external=Tr elements = f.shape[-2] * f.shape[-1] scale = 1.0 if not symmetric else 1.0 / np.sqrt(elements) expected = elements * scale * preffact * postfact - self.assertAlmostEqual(f_back[0,0,0], expected) - np.testing.assert_array_almost_equal(f_back.flat[1:], 0, decimal=5) + np.testing.assert_almost_equal(f_back[0,0,0].real, expected, decimal=decimal) + np.testing.assert_array_almost_equal(f_back.flat[1:], 0, decimal=decimal) def test_fwd_noscale_cufft(self): self.fwd_test(False, get_forward_cuFFT) @@ -122,33 +125,58 @@ def test_prepostfilt_fwd_scale_cufft_cupy(self): self.fwd_test(True, get_forward_cuFFT, postfact=2.0, preffact=1.5, external=False) def test_fwd_not_power_two_noscale_cufft_cupy(self): - self.fwd_test(False, get_forward_cuFFT, external=False, size=20) + self.fwd_test(False, get_forward_cuFFT, external=False, size=20, decimal=5) def test_fwd_not_power_two_scale_cufft_cupy(self): self.fwd_test(True, get_forward_cuFFT, external=False, size=20) def test_prefilt_fwd_not_power_two_noscale_cufft_cupy(self): - self.fwd_test(False, get_forward_cuFFT, preffact=2.0, external=False, size=20) + self.fwd_test(False, get_forward_cuFFT, preffact=2.0, external=False, size=20, decimal=5) def test_prefilt_fwd_not_power_two_scale_cufft_cupy(self): self.fwd_test(True, get_forward_cuFFT, preffact=2.0, external=False, size=20) def test_postfilt_fwd_not_power_two_noscale_cufft_cupy(self): - self.fwd_test(False, get_forward_cuFFT, postfact=2.0, external=False, size=20) + 
self.fwd_test(False, get_forward_cuFFT, postfact=2.0, external=False, size=20, decimal=5) def test_postfilt_fwd_not_power_two_scale_cufft_cupy(self): self.fwd_test(True, get_forward_cuFFT, postfact=2.0, external=False, size=20) def test_prepostfilt_not_power_two_fwd_noscale_cufft_cupy(self): - self.fwd_test(False, get_forward_cuFFT, postfact=2.0, preffact=1.5, external=False, size=20) + self.fwd_test(False, get_forward_cuFFT, postfact=2.0, preffact=1.5, external=False, size=20, decimal=5) def test_prepostfilt_not_power_two_fwd_scale_cufft_cupy(self): self.fwd_test(True, get_forward_cuFFT, postfact=2.0, preffact=1.5, external=False, size=20) + + def test_fwd_not_power_two_not_squared_noscale_cufft_cupy(self): + self.fwd_test(False, get_forward_cuFFT, external=False, size=20, squared=False, decimal=5) + + def test_fwd_not_power_two_not_squared_scale_cufft_cupy(self): + self.fwd_test(True, get_forward_cuFFT, external=False, size=20, squared=False) + + def test_prefilt_fwd_not_power_two_not_squared_noscale_cufft_cupy(self): + self.fwd_test(False, get_forward_cuFFT, preffact=2.0, external=False, size=20, squared=False, decimal=4) + + def test_prefilt_fwd_not_power_two_not_squared_scale_cufft_cupy(self): + self.fwd_test(True, get_forward_cuFFT, preffact=2.0, external=False, size=20, squared=False) + + def test_postfilt_fwd_not_power_two_not_squared_noscale_cufft_cupy(self): + self.fwd_test(False, get_forward_cuFFT, postfact=2.0, external=False, size=20, squared=False, decimal=4) + + def test_postfilt_fwd_not_power_two_not_squared_scale_cufft_cupy(self): + self.fwd_test(True, get_forward_cuFFT, postfact=2.0, external=False, size=20, squared=False) + + def test_prepostfilt_not_power_two_not_squared_fwd_noscale_cufft_cupy(self): + self.fwd_test(False, get_forward_cuFFT, postfact=2.0, preffact=1.5, external=False, size=20, squared=False, decimal=4) + + def test_prepostfilt_not_power_two_not_squared_fwd_scale_cufft_cupy(self): + self.fwd_test(True, get_forward_cuFFT, postfact=2.0, preffact=1.5, external=False, size=20, squared=False, decimal=5) ############# Trivial inverse transform tests ######### - def rev_test(self, symmetric, factory, preffact=None, postfact=None, external=True, size=32): - f = self.get_input(size) + def rev_test(self, symmetric, factory, preffact=None, postfact=None, external=True, + size=32, squared=True, decimal=6): + f = self.get_input(size, squared=squared) f_d = cp.asarray(f) if preffact is not None: pref = preffact * np.ones(shape=f.shape[-2:], dtype=np.complex64) @@ -170,8 +198,8 @@ def rev_test(self, symmetric, factory, preffact=None, postfact=None, external=Tr elements = f.shape[-2] * f.shape[-1] scale = 1.0 if not symmetric else np.sqrt(elements) expected = scale * preffact * postfact - self.assertAlmostEqual(f_back[0,0,0], expected) - np.testing.assert_array_almost_equal(f_back.flat[1:], 0, decimal=5) + np.testing.assert_almost_equal(f_back[0,0,0].real, expected, decimal=decimal) + np.testing.assert_array_almost_equal(f_back.flat[1:], 0, decimal=decimal) def test_rev_noscale_cufft(self): @@ -246,5 +274,29 @@ def test_prepostfilt_rev_not_power_two_noscale_cufft_cupy(self): def test_prepostfilt_rev_not_power_two_scale_cufft_cupy(self): self.rev_test(True, get_reverse_cuFFT, postfact=1.5, preffact=2.0, external=False, size=20) + def test_rev_not_power_two_not_squared_noscale_cufft_cupy(self): + self.rev_test(False, get_reverse_cuFFT, external=False, size=20, squared=False) + + def test_rev_not_power_two_not_squared_scale_cufft_cupy(self): + self.rev_test(True, 
get_reverse_cuFFT, external=False, size=20, squared=False) + + def test_prefilt_rev_not_power_two_not_squared_noscale_cufft_cupy(self): + self.rev_test(False, get_reverse_cuFFT, preffact=1.5, external=False, size=20, squared=False) + + def test_prefilt_rev_not_power_two_not_squared_scale_cufft_cupy(self): + self.rev_test(True, get_reverse_cuFFT, preffact=1.5, external=False, size=20, squared=False, decimal=5) + + def test_postfilt_rev_not_power_two_not_squared_noscale_cufft_cupy(self): + self.rev_test(False, get_reverse_cuFFT, postfact=1.5, external=False, size=20, squared=False) + + def test_postfilt_rev_not_power_two_not_squared_scale_cufft_cupy(self): + self.rev_test(True, get_reverse_cuFFT, postfact=1.5, external=False, size=20, squared=False) + + def test_prepostfilt_rev_not_power_not_squared_two_noscale_cufft_cupy(self): + self.rev_test(False, get_reverse_cuFFT, postfact=1.5, preffact=2.0, external=False, size=20, squared=False) + + def test_prepostfilt_rev_not_power_not_squared_two_scale_cufft_cupy(self): + self.rev_test(True, get_reverse_cuFFT, postfact=1.5, preffact=2.0, external=False, size=20, squared=False) + if __name__ == '__main__': unittest.main() diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_scaling_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_scaling_test.py index b16a9290..18a02ef1 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_scaling_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_scaling_test.py @@ -52,16 +52,19 @@ def get_reverse_Reikna(f, stream, class FftScalingTest(PyCudaTest): - def get_input(self): - rows = cols = 32 + def get_input(self, size, squared=True): + rows = cols = size + if not squared: + cols += 2 batches = 1 f = np.ones(shape=(batches, rows, cols), dtype=COMPLEX_TYPE) return f #### Trivial foward transform tests #### - def fwd_test(self, symmetric, factory, preffact=None, postfact=None, external=True): - f = self.get_input() + def fwd_test(self, symmetric, factory, preffact=None, postfact=None, external=True, + size=32, squared=True, decimal=6): + f = self.get_input(size, squared=squared) f_d = gpuarray.to_gpu(f) if preffact is not None: pref = preffact * np.ones(shape=f.shape[-2:], dtype=np.complex64) @@ -83,8 +86,8 @@ def fwd_test(self, symmetric, factory, preffact=None, postfact=None, external=Tr elements = f.shape[-2] * f.shape[-1] scale = 1.0 if not symmetric else 1.0 / np.sqrt(elements) expected = elements * scale * preffact * postfact - self.assertAlmostEqual(f_back[0,0,0], expected) - np.testing.assert_array_almost_equal(f_back.flat[1:], 0) + np.testing.assert_almost_equal(f_back[0,0,0].real, expected, decimal=decimal) + np.testing.assert_array_almost_equal(f_back.flat[1:], 0, decimal=decimal) def test_fwd_noscale_reikna(self): self.fwd_test(False, get_forward_Reikna) @@ -166,11 +169,59 @@ def test_prepostfilt_fwd_scale_cufft(self): def test_prepostfilt_fwd_scale_cufft_skcuda(self): self.fwd_test(True, get_forward_cuFFT, postfact=2.0, preffact=1.5, external=False) + def test_fwd_not_power_two_noscale_reikna(self): + self.fwd_test(False, get_forward_Reikna, size=20, decimal=4) + + def test_fwd_not_power_two_scale_reikna(self): + self.fwd_test(True, get_forward_Reikna, size=20, decimal=4) + + def test_prefilt_fwd_not_power_two_noscale_reikna(self): + self.fwd_test(False, get_forward_Reikna, preffact=2.0, size=20, decimal=4) + + def test_prefilt_fwd_not_power_two_scale_reikna(self): + self.fwd_test(True, get_forward_Reikna, preffact=2.0, size=20, decimal=4) + + def 
test_postfilt_fwd_not_power_two_noscale_reikna(self): + self.fwd_test(False, get_forward_Reikna, postfact=2.0, size=20, decimal=4) + + def test_postfilt_fwd_not_power_two_scale_reikna(self): + self.fwd_test(True, get_forward_Reikna, postfact=2.0, size=20, decimal=4) + + def test_prepostfilt_fwd_not_power_two_noscale_reikna(self): + self.fwd_test(False, get_forward_Reikna, postfact=2.0, preffact=1.5, size=20, decimal=4) + + def test_prepostfilt_fwd_not_power_two_scale_reikna(self): + self.fwd_test(True, get_forward_Reikna, postfact=2.0, preffact=1.5, size=20, decimal=4) + + def test_fwd_not_power_two_noscale_not_squared_reikna(self): + self.fwd_test(False, get_forward_Reikna, size=20, squared=False, decimal=4) + + def test_fwd_not_power_two_not_squared_scale_reikna(self): + self.fwd_test(True, get_forward_Reikna, size=20, squared=False, decimal=4) + + def test_prefilt_fwd_not_power_two_not_squared_noscale_reikna(self): + self.fwd_test(False, get_forward_Reikna, preffact=2.0, size=20, squared=False, decimal=4) + + def test_prefilt_fwd_not_power_two_not_squared_scale_reikna(self): + self.fwd_test(True, get_forward_Reikna, preffact=2.0, size=20, squared=False, decimal=4) + + def test_postfilt_fwd_not_power_two_not_squared_noscale_reikna(self): + self.fwd_test(False, get_forward_Reikna, postfact=2.0, size=20, squared=False, decimal=4) + + def test_postfilt_fwd_not_power_two_not_squared_scale_reikna(self): + self.fwd_test(True, get_forward_Reikna, postfact=2.0, size=20, squared=False, decimal=4) + + def test_prepostfilt_fwd_not_power_two_not_squared_noscale_reikna(self): + self.fwd_test(False, get_forward_Reikna, postfact=2.0, preffact=1.5, size=20, squared=False, decimal=4) + + def test_prepostfilt_fwd_not_power_two_not_squared_scale_reikna(self): + self.fwd_test(True, get_forward_Reikna, postfact=2.0, preffact=1.5, size=20, squared=False, decimal=4) ############# Trivial inverse transform tests ######### - def rev_test(self, symmetric, factory, preffact=None, postfact=None, external=True): - f = self.get_input() + def rev_test(self, symmetric, factory, preffact=None, postfact=None, external=True, + size=32, squared=True, decimal=6): + f = self.get_input(size, squared=squared) f_d = gpuarray.to_gpu(f) if preffact is not None: pref = preffact * np.ones(shape=f.shape[-2:], dtype=np.complex64) @@ -192,8 +243,8 @@ def rev_test(self, symmetric, factory, preffact=None, postfact=None, external=Tr elements = f.shape[-2] * f.shape[-1] scale = 1.0 if not symmetric else np.sqrt(elements) expected = scale * preffact * postfact - self.assertAlmostEqual(f_back[0,0,0], expected) - np.testing.assert_array_almost_equal(f_back.flat[1:], 0) + np.testing.assert_almost_equal(f_back[0,0,0].real, expected, decimal=decimal) + np.testing.assert_array_almost_equal(f_back.flat[1:], 0, decimal=decimal) def test_rev_noscale_reikna(self): @@ -276,6 +327,54 @@ def test_prepostfilt_rev_scale_cufft(self): def test_prepostfilt_rev_scale_cufft_skcuda(self): self.rev_test(True, get_reverse_cuFFT, postfact=1.5, preffact=2.0, external=False) + def test_rev_not_power_two_noscale_reikna(self): + self.rev_test(False, get_reverse_Reikna, size=20) + + def test_rev_not_power_two_scale_reikna(self): + self.rev_test(True, get_reverse_Reikna, size=20, decimal=5) + + def test_prefilt_rev_not_power_two_noscale_reikna(self): + self.rev_test(False, get_reverse_Reikna, preffact=1.5, size=20) + + def test_prefilt_rev_not_power_two_scale_reikna(self): + self.rev_test(True, get_reverse_Reikna, preffact=1.5, size=20, decimal=5) + + def 
test_postfilt_rev_not_power_two_noscale_reikna(self): + self.rev_test(False, get_reverse_Reikna, postfact=1.5, size=20) + + def test_postfilt_rev_not_power_two_scale_reikna(self): + self.rev_test(True, get_reverse_Reikna, postfact=1.5, size=20, decimal=5) + + def test_prepostfilt_rev_not_power_two_noscale_reikna(self): + self.rev_test(False, get_reverse_Reikna, postfact=1.5, preffact=2.0, size=20) + + def test_prepostfilt_rev_not_power_two_scale_reikna(self): + self.rev_test(True, get_reverse_Reikna, postfact=1.5, preffact=2.0, size=20, decimal=5) + + def test_rev_not_power_two_not_squared_noscale_reikna(self): + self.rev_test(False, get_reverse_Reikna, size=20, squared=False) + + def test_rev_not_power_two_not_squared_scale_reikna(self): + self.rev_test(True, get_reverse_Reikna, size=20, squared=False, decimal=5) + + def test_prefilt_rev_not_power_two_not_squared_noscale_reikna(self): + self.rev_test(False, get_reverse_Reikna, preffact=1.5, size=20, squared=False) + + def test_prefilt_rev_not_power_two_not_squared_scale_reikna(self): + self.rev_test(True, get_reverse_Reikna, preffact=1.5, size=20, squared=False, decimal=5) + + def test_postfilt_rev_not_power_two_not_squared_noscale_reikna(self): + self.rev_test(False, get_reverse_Reikna, postfact=1.5, size=20, squared=False) + + def test_postfilt_rev_not_power_two_not_squared_scale_reikna(self): + self.rev_test(True, get_reverse_Reikna, postfact=1.5, size=20, squared=False, decimal=5) + + def test_prepostfilt_rev_not_power_two_not_squared_noscale_reikna(self): + self.rev_test(False, get_reverse_Reikna, postfact=1.5, preffact=2.0, size=20, squared=False) + + def test_prepostfilt_rev_not_power_two_not_squared_scale_reikna(self): + self.rev_test(True, get_reverse_Reikna, postfact=1.5, preffact=2.0, size=20, squared=False, decimal=5) + if __name__ == '__main__': unittest.main() From 02e17a414d6a7d0f1ca978cfb09ee1aea1233a2b Mon Sep 17 00:00:00 2001 From: Jari Date: Fri, 8 Mar 2024 10:19:06 +0000 Subject: [PATCH 10/14] Option to use full polynomial in ML linesearch (#487) * Option to use full polynomial in ML linesearch * Tidy and add full polynomial for Euclid model * code restructure, less switching * change new parameter from boolean to str --------- Co-authored-by: Benedikt Daurer --- ptypy/engines/ML.py | 323 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 311 insertions(+), 12 deletions(-) diff --git a/ptypy/engines/ML.py b/ptypy/engines/ML.py index e7492b42..9d5bb7d1 100644 --- a/ptypy/engines/ML.py +++ b/ptypy/engines/ML.py @@ -100,6 +100,12 @@ class ML(PositionCorrectionEngine): lowlim = 0 help = Number of iterations before probe update starts + [poly_line_coeffs] + default = quadratic + type = str + help = How many coefficients to be used in the the linesearch + doc = choose between the 'quadratic' approximation (default) or 'all' + """ SUPPORTED_MODELS = [Full, Vanilla, Bragg3dModel, BlockVanilla, BlockFull, GradFull, BlockGradFull] @@ -287,16 +293,25 @@ def engine_iterate(self, num=1): # In principle, the way things are now programmed this part # could be iterated over in a real Newton-Raphson style. 
t2 = time.time() - B = self.ML_model.poly_line_coeffs(self.ob_h, self.pr_h) + if self.p.poly_line_coeffs == "all": + B = self.ML_model.poly_line_all_coeffs(self.ob_h, self.pr_h) + diffB = np.arange(1,len(B))*B[1:] # coefficients of poly derivative + roots = np.roots(np.flip(diffB.astype(np.double))) # roots only supports double + real_roots = np.real(roots[np.isreal(roots)]) # not interested in complex roots + if real_roots.size == 1: # single real root + self.tmin = dt(real_roots[0]) + else: # find real root with smallest poly objective + evalp = lambda root: np.polyval(np.flip(B),root) + self.tmin = dt(min(real_roots, key=evalp)) # root with smallest poly objective + elif self.p.poly_line_coeffs == "quadratic": + B = self.ML_model.poly_line_coeffs(self.ob_h, self.pr_h) + # same as above but quicker when poly quadratic + self.tmin = dt(-0.5 * B[1] / B[2]) + else: + raise NotImplementedError("poly_line_coeffs should be 'quadratic' or 'all'") + tc += time.time() - t2 - - if np.isinf(B).any() or np.isnan(B).any(): - logger.warning( - 'Warning! inf or nan found! Trying to continue...') - B[np.isinf(B)] = 0. - B[np.isnan(B)] = 0. - - self.tmin = dt(-.5 * B[1] / B[2]) + self.ob_h *= self.tmin self.pr_h *= self.tmin self.ob += self.ob_h @@ -427,6 +442,13 @@ def poly_line_coeffs(self, ob_h, pr_h): """ raise NotImplementedError + def poly_line_all_coeffs(self, ob_h, pr_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h + """ + raise NotImplementedError + class GaussianModel(BaseModel): """ @@ -578,7 +600,7 @@ def poly_line_coeffs(self, ob_h, pr_h): w = pod.upsample(w) B[0] += np.dot(w.flat, (A0**2).flat) * Brenorm - B[1] += np.dot(w.flat, (2 * A0 * A1).flat) * Brenorm + B[1] += np.dot(w.flat, (2*A0*A1).flat) * Brenorm B[2] += np.dot(w.flat, (A1**2 + 2*A0*A2).flat) * Brenorm parallel.allreduce(B) @@ -589,10 +611,101 @@ def poly_line_coeffs(self, ob_h, pr_h): B += Brenorm * self.regularizer.poly_line_coeffs( ob_h.storages[name].data, s.data) + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + self.B = B return B + def poly_line_all_coeffs(self, ob_h, pr_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h + """ + + B = np.zeros((9,), dtype=np.longdouble) + Brenorm = 1. 
/ self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + w = self.weights[diff_view] + I = diff_view.data + + A0 = None + A1 = None + A2 = None + A3 = None + A4 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pod.probe * ob_h[pod.ob_view] + + pr_h[pod.pr_view] * pod.object) + b = pod.fw(pr_h[pod.pr_view] * ob_h[pod.ob_view]) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = (2 * np.real(f * b.conj()).astype(np.longdouble) + + u.abs2(a).astype(np.longdouble)) + A3 = 2 * np.real(a * b.conj()).astype(np.longdouble) + A4 = u.abs2(b).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += 2 * np.real(f * b.conj()) + u.abs2(a) + A3 += 2 * np.real(a * b.conj()) + A4 += u.abs2(b) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + A3 *= self.float_intens_coeff[dname] + A4 *= self.float_intens_coeff[dname] + + A0 = np.double(A0) - pod.upsample(I) + #A0 -= pod.upsample(I) + w = pod.upsample(w) + + B[0] += np.dot(w.flat, (A0**2).flat) * Brenorm + B[1] += np.dot(w.flat, (2*A0*A1).flat) * Brenorm + B[2] += np.dot(w.flat, (A1**2 + 2*A0*A2).flat) * Brenorm + B[3] += np.dot(w.flat, (2*A0*A3 + 2*A1*A2).flat) * Brenorm + B[4] += np.dot(w.flat, (A2**2 + 2*A1*A3 + 2*A0*A4).flat) * Brenorm + B[5] += np.dot(w.flat, (2*A1*A4 + 2*A2*A3).flat) * Brenorm + B[6] += np.dot(w.flat, (A3**2 + 2*A2*A4).flat) * Brenorm + B[7] += np.dot(w.flat, (2*A3*A4).flat) * Brenorm + B[8] += np.dot(w.flat, (A4**2).flat) * Brenorm + + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B[:3] += Brenorm * self.regularizer.poly_line_coeffs( + ob_h.storages[name].data, s.data) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + + self.B = B + + return B + class PoissonModel(BaseModel): """ @@ -730,7 +843,7 @@ def poly_line_coeffs(self, ob_h, pr_h): B[0] += (self.LLbase[dname] + (m * (A0 - I * np.log(A0))).sum().astype(np.float64)) * Brenorm B[1] += np.dot(m.flat, (A1*DI).flat) * Brenorm - B[2] += (np.dot(m.flat, (A2*DI).flat) + .5*np.dot(m.flat, (I*(A1/A0)**2.).flat)) * Brenorm + B[2] += (np.dot(m.flat, (A2*DI).flat) + 0.5*np.dot(m.flat, (I*(A1/A0)**2).flat)) * Brenorm parallel.allreduce(B) @@ -740,6 +853,96 @@ def poly_line_coeffs(self, ob_h, pr_h): B += Brenorm * self.regularizer.poly_line_coeffs( ob_h.storages[name].data, s.data) + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. 
+ + self.B = B + + return B + + def poly_line_all_coeffs(self, ob_h, pr_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h + """ + B = np.zeros((9,), dtype=np.longdouble) + Brenorm = 1/(self.tot_measpts * self.LL[0])**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and intensities for this view + I = diff_view.data + m = diff_view.pod.ma_view.data + + A0 = None + A1 = None + A2 = None + A3 = None + A4 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pod.probe * ob_h[pod.ob_view] + + pr_h[pod.pr_view] * pod.object) + b = pod.fw(pr_h[pod.pr_view] * ob_h[pod.ob_view]) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = (2 * np.real(f * b.conj()).astype(np.longdouble) + + u.abs2(a).astype(np.longdouble)) + A3 = 2 * np.real(a * b.conj()).astype(np.longdouble) + A4 = u.abs2(b).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += 2 * np.real(f * b.conj()) + u.abs2(a) + A3 += 2 * np.real(a * b.conj()) + A4 += u.abs2(b) + + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + A3 *= self.float_intens_coeff[dname] + A4 *= self.float_intens_coeff[dname] + + A0 += 1e-6 + DI = 1. - I/A0 + + B[0] += (self.LLbase[dname] + (m * (A0 - I * np.log(A0))).sum().astype(np.float64)) * Brenorm + B[1] += np.dot(m.flat, (A1*DI).flat) * Brenorm + B[2] += (np.dot(m.flat, (A2*DI).flat) + 0.5*np.dot(m.flat, (I*(A1/A0)**2).flat)) * Brenorm + B[3] += (np.dot(m.flat, (A3*DI).flat) + 0.5*np.dot(m.flat, (I*((2*A1*A2)/A0**2)).flat)) * Brenorm + B[4] += (np.dot(m.flat, (A4*DI).flat) + 0.5*np.dot(m.flat, (I*((A2**2 + 2*A1*A3)/A0**2)).flat)) * Brenorm + B[5] += 0.5*np.dot(m.flat, (I*((2*A1*A4 + 2*A2*A3)/A0**2)).flat) * Brenorm + B[6] += 0.5*np.dot(m.flat, (I*((A3**2 + 2*A2*A4)/A0**2)).flat) * Brenorm + B[7] += 0.5*np.dot(m.flat, (I*((2*A3*A4)/A0**2)).flat) * Brenorm + B[8] += 0.5*np.dot(m.flat, (I*(A4/A0)**2).flat) * Brenorm + + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B[:3] += Brenorm * self.regularizer.poly_line_coeffs( + ob_h.storages[name].data, s.data) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. + self.B = B return B @@ -896,7 +1099,7 @@ def poly_line_coeffs(self, ob_h, pr_h): B[0] += np.dot(w.flat, ((np.sqrt(A0) - A)**2).flat) * Brenorm B[1] += np.dot(w.flat, (A1*DA).flat) * Brenorm - B[2] += (np.dot(w.flat, (A2*DA).flat) + .25*np.dot(w.flat, (A1**2 * A/A0**(3/2)).flat)) * Brenorm + B[2] += (np.dot(w.flat, (A2*DA).flat) + 0.25*np.dot(w.flat, (A1**2 * A/A0**(3/2)).flat)) * Brenorm parallel.allreduce(B) @@ -906,6 +1109,102 @@ def poly_line_coeffs(self, ob_h, pr_h): B += Brenorm * self.regularizer.poly_line_coeffs( ob_h.storages[name].data, s.data) + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. 
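When only the first three coefficients are computed (the "quadratic" branch of the line search introduced at the top of this patch), the minimizing step has the closed form t = -B[1] / (2 * B[2]). A small standalone sketch with a hypothetical helper name; the curvature guard is an illustrative addition and not part of the patch:

import numpy as np

def quadratic_step(B):
    # Minimize B[0] + B[1]*t + B[2]*t**2 in closed form.
    B = np.asarray(B, dtype=np.double)
    if not np.isfinite(B[2]) or B[2] <= 0:
        return 1.0  # parabola flat or concave: no finite minimum, fall back
    return -0.5 * B[1] / B[2]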
+ + self.B = B + + return B + + def poly_line_all_coeffs(self, ob_h, pr_h): + """ + Compute all the coefficients of the polynomial for line minimization + in direction h + """ + + B = np.zeros((9,), dtype=np.longdouble) + Brenorm = 1. / self.LL[0]**2 + + # Outer loop: through diffraction patterns + for dname, diff_view in self.di.views.items(): + if not diff_view.active: + continue + + # Weights and amplitudes for this view + w = self.weights[diff_view] + A = np.sqrt(diff_view.data) + + A0 = None + A1 = None + A2 = None + A3 = None + A4 = None + + for name, pod in diff_view.pods.items(): + if not pod.active: + continue + f = pod.fw(pod.probe * pod.object) + a = pod.fw(pod.probe * ob_h[pod.ob_view] + + pr_h[pod.pr_view] * pod.object) + b = pod.fw(pr_h[pod.pr_view] * ob_h[pod.ob_view]) + + if A0 is None: + A0 = u.abs2(f).astype(np.longdouble) + A1 = 2 * np.real(f * a.conj()).astype(np.longdouble) + A2 = (2 * np.real(f * b.conj()).astype(np.longdouble) + + u.abs2(a).astype(np.longdouble)) + A3 = 2 * np.real(a * b.conj()).astype(np.longdouble) + A4 = u.abs2(b).astype(np.longdouble) + else: + A0 += u.abs2(f) + A1 += 2 * np.real(f * a.conj()) + A2 += 2 * np.real(f * b.conj()) + u.abs2(a) + A3 += 2 * np.real(a * b.conj()) + A4 += u.abs2(b) + + if self.p.floating_intensities: + A0 *= self.float_intens_coeff[dname] + A1 *= self.float_intens_coeff[dname] + A2 *= self.float_intens_coeff[dname] + A3 *= self.float_intens_coeff[dname] + A4 *= self.float_intens_coeff[dname] + + A0 += 1e-12 # cf Poisson model sqrt(1e-12) = 1e-6 + DA = 1. - A/np.sqrt(A0) + DA32 = A/A0**(3/2) + + B[0] += np.dot(w.flat, ((np.sqrt(A0) - A)**2).flat) * Brenorm + B[1] += np.dot(w.flat, (A1*DA).flat) * Brenorm + B[2] += (np.dot(w.flat, (A2*DA).flat) + 0.25*np.dot(w.flat, (A1**2 * DA32).flat)) * Brenorm + B[3] += (np.dot(w.flat, (A3*DA).flat) + 0.25*np.dot(w.flat, (2*A1*A2 * DA32).flat) - 0.125*np.dot(w.flat, (A1**3/A0**2).flat)) * Brenorm + B[4] += (np.dot(w.flat, (A4*DA).flat) + 0.25*np.dot(w.flat, ((A2**2 + 2*A1*A3) * DA32).flat) - 0.125*np.dot(w.flat, ((3*A1**2*A2)/A0**2).flat) + + 0.015625*np.dot(w.flat, (A1**4/A0**3).flat)) * Brenorm + B[5] += (0.25*np.dot(w.flat, ((2*A2*A3 + 2*A1*A4) * DA32).flat) - 0.125*np.dot(w.flat, ((3*A1*A2**2 + 3*A1**2*A3)/A0**2).flat) + + 0.015625*np.dot(w.flat, ((4*A1**3*A2)/A0**3).flat)) * Brenorm + B[6] += (0.25*np.dot(w.flat, ((A3**2 + 2*A2*A4) * DA32).flat) - 0.125*np.dot(w.flat, ((A2**3 + 3*A1**2*A4 + 6*A1*A2*A3)/A0**2).flat) + + 0.015625*np.dot(w.flat, ((6*A1**2*A2**2 + 4*A1**3*A3)/A0**3).flat)) * Brenorm + B[7] += (0.25*np.dot(w.flat, (2*A3*A4 * DA32).flat) - 0.125*np.dot(w.flat, ((3*A2**2*A3 + 3*A1*A3**2 + 6*A1*A2*A4)/A0**2).flat) + + 0.015625*np.dot(w.flat, ((4*A1*A2**3 + 12*A1**2*A2*A3 + 4*A1**3*A4)/A0**3).flat)) * Brenorm + B[8] += (0.25*np.dot(w.flat, (A4**2 * DA32).flat) - 0.125*np.dot(w.flat, ((3*A2*A3**2 + 3*A2**2*A4 + 6*A1*A3*A4)/A0**2).flat) + + 0.015625*np.dot(w.flat, ((A2**4 + 12*A1*A2**2*A3 + 6*A1**2*A3**2 + 12*A1**2*A2*A4)/A0**3).flat)) * Brenorm + + parallel.allreduce(B) + + # Object regularizer + if self.regularizer: + for name, s in self.ob.storages.items(): + B[:3] += Brenorm * self.regularizer.poly_line_coeffs( + ob_h.storages[name].data, s.data) + + if np.isinf(B).any() or np.isnan(B).any(): + logger.warning( + 'Warning! inf or nan found! Trying to continue...') + B[np.isinf(B)] = 0. + B[np.isnan(B)] = 0. 
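Each noise model above now fills a nine-element vector B with the coefficients of the degree-8 line-search polynomial, and the engine code at the top of this patch selects the step length from the real roots of its derivative. A standalone sketch of that selection, with a hypothetical helper name and assuming at least one real root exists:

import numpy as np

def step_from_poly_coeffs(B):
    # B holds polynomial coefficients in ascending order: P(t) = sum(B[k] * t**k)
    B = np.asarray(B, dtype=np.double)
    dB = np.arange(1, len(B)) * B[1:]         # coefficients of dP/dt (ascending)
    roots = np.roots(np.flip(dB))             # np.roots expects descending order
    real_roots = np.real(roots[np.isreal(roots)])
    value = lambda t: np.polyval(np.flip(B), t)
    return float(min(real_roots, key=value))  # real root with the smallest P(t)

For example, step_from_poly_coeffs([0., 0., 1., 0., 0.25]) returns 0.0, the minimum of t**2 + 0.25*t**4.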
+ self.B = B return B From 08e6b2d91f2b61f93f0477be1c5f30ec89a1934f Mon Sep 17 00:00:00 2001 From: Pierre Thibault Date: Fri, 8 Mar 2024 17:50:09 +0000 Subject: [PATCH 11/14] WIP GPU implementation of FFT-based smoothing in ML (#504) * WIP kernel implementation of general FFT filter * Tests for FFT filter * FFT-based gaussian smoothing in ML * Add dummy _get_smooth_gradient_fft in ML_serial * added batched fft_filter tests * implented numpy based gaussian fft filter and added more tests * Introduced new parameter for changing method for smoothing kernel * Fixed smooth gradient method parameter * add example for FFT smoothing in ML pycuda * Added new template * Working on FFT based Gaussian filter for cupy engines * Fixed another bug in batch multiply kernel * fft based Gaussian smoothing works with both Ml_pycuda and ML_cupy * small changes to accelerate tests * removed debugging traces * Improve error message when convolution kernel too big --------- Co-authored-by: Timothy Poon Co-authored-by: Benedikt Daurer --- ptypy/accelerate/base/array_utils.py | 50 +++++- ptypy/accelerate/base/engines/ML_serial.py | 25 ++- .../cuda_common/batched_multiply.cu | 3 +- ptypy/accelerate/cuda_cupy/array_utils.py | 88 +++++++++++ ptypy/accelerate/cuda_cupy/engines/ML_cupy.py | 25 ++- ptypy/accelerate/cuda_cupy/kernels.py | 28 ++++ ptypy/accelerate/cuda_pycuda/array_utils.py | 89 +++++++++++ .../cuda_pycuda/engines/ML_pycuda.py | 25 ++- ptypy/accelerate/cuda_pycuda/fft.py | 4 +- ptypy/accelerate/cuda_pycuda/kernels.py | 28 ++++ .../misc/moonflower_ML_cupy_fft_smoothing.py | 64 ++++++++ .../moonflower_ML_pycuda_fft_smoothing.py | 64 ++++++++ .../base_tests/array_utils_test.py | 148 ++++++++++++++++++ .../cuda_cupy_tests/array_utils_test.py | 141 +++++++++++++++++ .../cuda_pycuda_tests/array_utils_test.py | 143 +++++++++++++++++ .../cuda_pycuda_tests/fft_setstream_test.py | 1 + 16 files changed, 906 insertions(+), 20 deletions(-) create mode 100644 templates/misc/moonflower_ML_cupy_fft_smoothing.py create mode 100644 templates/misc/moonflower_ML_pycuda_fft_smoothing.py diff --git a/ptypy/accelerate/base/array_utils.py b/ptypy/accelerate/base/array_utils.py index b94a056f..bec4cc8e 100644 --- a/ptypy/accelerate/base/array_utils.py +++ b/ptypy/accelerate/base/array_utils.py @@ -56,6 +56,16 @@ def norm2(input): return np.sum(abs2(input)) +def gaussian_kernel_2d(shape, sigmau, sigmav): + """ + 2D Gaussian kernel using the last 2 dimension of given shape + Requires sigma for both dimensions (sigmau and sigmav) + """ + u, v = np.fft.fftfreq(shape[-2]), np.fft.fftfreq(shape[-1]) + uu, vv = np.meshgrid(u, v, sparse=True, indexing='ij') + kernel = np.exp(-2* ( (np.pi*sigmau)**2 * uu**2 + (np.pi*sigmav)**2 * vv**2 ) ) + return kernel + def complex_gaussian_filter(input, mfs): ''' takes 2D and 3D arrays. Complex input, complex output. 
mfs has len 0 2: + raise NotImplementedError("Only batches of 2D arrays allowed!") + elif len(mfs) == 1: + mfs = np.array([mfs,mfs]) + else: + mfs = np.array(mfs) + + k = gaussian_kernel_2d(input.shape, mfs[0], mfs[1]).astype(input.dtype) + return fft_filter(input, k) + + +def fft_filter(input, kernel, prefactor=None, postfactor=None, forward=True): + """ + Compute + output = ifft(fft( prefactor * input ) * kernel) * postfactor + """ + # Make a copy (and cast if necessary) + x = np.array(input) + + + if prefactor is not None: + x *= prefactor + + if forward: + x = np.fft.ifftn(np.fft.fftn(x, norm="ortho") * kernel, norm="ortho") + else: + x = np.fft.fftn(np.fft.ifftn(x, norm="ortho") * kernel, norm="ortho") + + if postfactor is not None: + x *= postfactor + + return x + + def mass_center(A): ''' Input will always be real, and 2d or 3d, single precision here @@ -81,7 +129,7 @@ def interpolated_shift(c, shift, do_linear=False): ''' complex bicubic interpolated shift. complex output. This shift should be applied to 2D arrays. shift should have len=c.ndims - + ''' if not do_linear: return ndi.shift(np.real(c), shift, order=3, prefilter=True) + 1j * ndi.shift( diff --git a/ptypy/accelerate/base/engines/ML_serial.py b/ptypy/accelerate/base/engines/ML_serial.py index 38f63f38..1e953d26 100644 --- a/ptypy/accelerate/base/engines/ML_serial.py +++ b/ptypy/accelerate/base/engines/ML_serial.py @@ -23,6 +23,7 @@ from ptypy.engines import register from ptypy.accelerate.base.kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel from ptypy.accelerate.base import address_manglers +from ptypy.accelerate.base.array_utils import complex_gaussian_filter, complex_gaussian_filter_fft __all__ = ['ML_serial'] @@ -30,6 +31,15 @@ @register() class ML_serial(ML): + """ + Defaults: + + [smooth_gradient_method] + default = convolution + type = str + help = Method to be used for smoothing the gradient, choose between ```convolution``` or ```fft```. + """ + def __init__(self, ptycho_parent, pars=None): """ Maximum likelihood reconstruction engine. @@ -143,7 +153,12 @@ def engine_prepare(self): self.ML_model.prepare() def _get_smooth_gradient(self, data, sigma): - return self.smooth_gradient(data) + if self.p.smooth_gradient_method == "convolution": + return complex_gaussian_filter(data, sigma) + elif self.p.smooth_gradient_method == "fft": + return complex_gaussian_filter_fft(data, sigma) + else: + raise NotImplementedError("smooth_gradient_method should be ```convolution``` or ```fft```.") def _replace_ob_grad(self): new_ob_grad = self.ob_grad_new @@ -272,7 +287,7 @@ def engine_iterate(self, num=1): return error_dct # np.array([[self.ML_model.LL[0]] * 3]) def position_update(self): - """ + """ Position refinement """ if not self.do_position_refinement: @@ -283,7 +298,7 @@ def position_update(self): # Update positions if do_update_pos: """ - Iterates through all positions and refines them by a given algorithm. + Iterates through all positions and refines them by a given algorithm. 
""" log(4, "----------- START POS REF -------------") for dID in self.di.S.keys(): @@ -308,7 +323,7 @@ def position_update(self): max_oby = ob.shape[-2] - aux.shape[-2] - 1 max_obx = ob.shape[-1] - aux.shape[-1] - 1 - # We need to re-calculate the current error + # We need to re-calculate the current error PCK.build_aux(aux, addr, ob, pr) aux[:] = FW(aux) PCK.log_likelihood_ml(aux, addr, I, w, err_phot) @@ -338,7 +353,7 @@ def engine_finalize(self): for i,view in enumerate(d.views): for j,(pname, pod) in enumerate(view.pods.items()): delta = (prep.addr[i][j][1][1:] - prep.original_addr[i][j][1][1:]) * res - pod.ob_view.coord += delta + pod.ob_view.coord += delta pod.ob_view.storage.update_views(pod.ob_view) self.ptycho.record_positions = True diff --git a/ptypy/accelerate/cuda_common/batched_multiply.cu b/ptypy/accelerate/cuda_common/batched_multiply.cu index 11394f68..e5494027 100644 --- a/ptypy/accelerate/cuda_common/batched_multiply.cu +++ b/ptypy/accelerate/cuda_common/batched_multiply.cu @@ -22,10 +22,11 @@ extern "C" __global__ void batched_multiply(const complex* input, int gy = threadIdx.y + blockIdx.y * blockDim.y; int gz = threadIdx.z + blockIdx.z * blockDim.z; - if (gx > rows || gy > columns || gz > nBatches) + if (gx > rows - 1 || gy > columns - 1 || gz > nBatches) return; auto val = input[gz * rows * columns + gy * rows + gx]; + if (MPY_DO_FILT) // set at compile-time { val *= filter[gy * rows + gx]; diff --git a/ptypy/accelerate/cuda_cupy/array_utils.py b/ptypy/accelerate/cuda_cupy/array_utils.py index 9c68d943..626aa985 100644 --- a/ptypy/accelerate/cuda_cupy/array_utils.py +++ b/ptypy/accelerate/cuda_cupy/array_utils.py @@ -2,6 +2,7 @@ import numpy as np from ptypy.accelerate.cuda_common.utils import map2ctype +from ptypy.accelerate.base.array_utils import gaussian_kernel_2d from ptypy.utils.math_utils import gaussian from . 
import load_kernel @@ -62,6 +63,37 @@ def dot(self, A: cp.ndarray, B: cp.ndarray, out: cp.ndarray = None) -> cp.ndarra def norm2(self, A, out=None): return self.dot(A, A, out) +class BatchedMultiplyKernel: + def __init__(self, array, queue=None, math_type=np.complex64): + self.queue = queue + self.array_shape = array.shape[-2:] + self.batches = int(np.prod(array.shape[0:array.ndim-2]) if array.ndim > 2 else 1) + self.batched_multiply_cuda = load_kernel("batched_multiply", { + 'MPY_DO_SCALE': 'true', + 'MPY_DO_FILT': 'true', + 'IN_TYPE': 'float' if array.dtype==np.complex64 else 'double', + 'OUT_TYPE': 'float' if array.dtype==np.complex64 else 'double', + 'MATH_TYPE': 'float' if math_type==np.complex64 else 'double' + }) + self.block = (32,32,1) + self.grid = ( + int((self.array_shape[0] + 31) // 32), + int((self.array_shape[1] + 31) // 32), + int(self.batches) + ) + + def multiply(self, x,y, scale=1.): + assert x.dtype == y.dtype, "Input arrays must be of same data type" + assert x.shape[-2:] == y.shape[-2:], "Input arrays must be of the same size in last 2 dims" + if self.queue is not None: + self.queue.use() + self.batched_multiply_cuda(self.grid, + self.block, + args=(x,x,y, + np.float32(scale), + np.int32(self.batches), + np.int32(self.array_shape[0]), + np.int32(self.array_shape[1]))) class TransposeKernel: @@ -322,6 +354,62 @@ def delxb(self, input, out, axis=-1): out, lower_dim, higher_dim, np.int32(input.shape[axis]))) +class FFTGaussianSmoothingKernel: + def __init__(self, queue=None, kernel_type='float'): + if kernel_type not in ['float', 'double']: + raise ValueError('Invalid data type for kernel') + self.kernel_type = kernel_type + self.dtype = np.complex64 + self.stype = "complex" + self.queue = queue + self.sigma = None + + from .kernels import FFTFilterKernel + + # Create general FFT filter object + self.fft_filter = FFTFilterKernel(queue_thread=queue) + + def allocate(self, shape, sigma=1.): + + # Create kernel + self.sigma = sigma + kernel = self._compute_kernel(shape, sigma) + + # Allocate filter + kernel_dev = cp.asarray(kernel) + self.fft_filter.allocate(kernel=kernel_dev) + + def _compute_kernel(self, shape, sigma): + # Create kernel + sigma = np.array(sigma) + if sigma.size == 1: + sigma = np.array([sigma, sigma]) + if sigma.size > 2: + raise NotImplementedError("Only batches of 2D arrays allowed!") + self.sigma = sigma + + kernel = gaussian_kernel_2d(shape, sigma[0], sigma[1]).astype(self.dtype) + + # Extend kernel if needed + if len(shape) == 3: + kernel = np.tile(kernel, (shape[0],1,1)) + + return kernel + + def filter(self, data, sigma=None): + """ + Apply filter in-place + + If sigma is not None: reallocate a new fft filter first. + """ + if self.sigma is None: + self.allocate(shape=data.shape, sigma=sigma) + else: + self.fft_filter.set_kernel(self._compute_kernel(data.shape, sigma)) + + self.fft_filter.apply_filter(data) + + class GaussianSmoothingKernel: def __init__(self, queue=None, num_stdevs=4, kernel_type='float'): if kernel_type not in ['float', 'double']: diff --git a/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py b/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py index efcc4233..caa0a192 100644 --- a/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py +++ b/ptypy/accelerate/cuda_cupy/engines/ML_cupy.py @@ -23,7 +23,7 @@ from .. 
import get_context, log_device_memory_stats from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel -from ..array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel, TransposeKernel +from ..array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel, FFTGaussianSmoothingKernel, TransposeKernel from ..mem_utils import GpuDataManager #from ..mem_utils import GpuDataManager @@ -79,8 +79,12 @@ def engine_initialize(self): self.qu_htod = cp.cuda.Stream() self.qu_dtoh = cp.cuda.Stream() - self.GSK = GaussianSmoothingKernel(queue=self.queue) - self.GSK.tmp = None + if self.p.smooth_gradient_method == "convolution": + self.GSK = GaussianSmoothingKernel(queue=self.queue) + self.GSK.tmp = None + + if self.p.smooth_gradient_method == "fft": + self.FGSK = FFTGaussianSmoothingKernel(queue=self.queue) # Real/Fourier Support Kernel self.RSK = {} @@ -260,9 +264,18 @@ def _set_pr_ob_ref_for_data(self, dev='gpu', container=None, sync_copy=False): dev=dev, container=container, sync_copy=sync_copy) def _get_smooth_gradient(self, data, sigma): - if self.GSK.tmp is None: - self.GSK.tmp = cp.empty(data.shape, dtype=np.complex64) - self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp) + if self.p.smooth_gradient_method == "convolution": + if self.GSK.tmp is None: + self.GSK.tmp = cp.empty(data.shape, dtype=np.complex64) + try: + self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp) + except MemoryError: + raise RuntimeError("Convolution kernel too large for direct convolution on GPU", + "Please reduce parameter smooth_gradient or set smooth_gradient_method='fft'.") + elif self.p.smooth_gradient_method == "fft": + self.FGSK.filter(data, sigma) + else: + raise NotImplementedError("smooth_gradient_method should be ```convolution``` or ```fft```.") return data def _replace_ob_grad(self): diff --git a/ptypy/accelerate/cuda_cupy/kernels.py b/ptypy/accelerate/cuda_cupy/kernels.py index 049108e7..b743e9b0 100644 --- a/ptypy/accelerate/cuda_cupy/kernels.py +++ b/ptypy/accelerate/cuda_cupy/kernels.py @@ -170,6 +170,34 @@ def apply_real_support(self, x): x *= self.support +class FFTFilterKernel: + def __init__(self, queue_thread=None, fft='cupy'): + # Current implementation recompiles every time there is a change in input shape. + self.queue = queue_thread + self._fft_type = fft + self._fft1 = None + self._fft2 = None + def allocate(self, kernel, prefactor=None, postfactor=None, forward=True): + FFT = choose_fft(kernel.shape[-2:], fft_type=self._fft_type) + + self._fft1 = FFT(kernel, self.queue, + pre_fft=prefactor, + post_fft=kernel, + symmetric=True, + forward=forward) + self._fft2 = FFT(kernel, self.queue, + post_fft=postfactor, + symmetric=True, + forward=not forward) + + def set_kernel(self, kernel): + self._fft1.post_fft.set(kernel) + + def apply_filter(self, x): + self._fft1.ft(x, x) + self._fft2.ift(x, x) + + class FourierUpdateKernel(ab.FourierUpdateKernel): def __init__(self, aux, nmodes=1, queue_thread=None, accumulate_type='float', math_type='float'): diff --git a/ptypy/accelerate/cuda_pycuda/array_utils.py b/ptypy/accelerate/cuda_pycuda/array_utils.py index 72eae996..7e592228 100644 --- a/ptypy/accelerate/cuda_pycuda/array_utils.py +++ b/ptypy/accelerate/cuda_pycuda/array_utils.py @@ -1,4 +1,5 @@ from ptypy.accelerate.cuda_common.utils import map2ctype +from ptypy.accelerate.base.array_utils import gaussian_kernel_2d from . 
import load_kernel from pycuda import gpuarray @@ -65,6 +66,38 @@ def dot(self, A, B, out=None): def norm2(self, A, out=None): return self.dot(A, A, out) +class BatchedMultiplyKernel: + def __init__(self, array, queue=None, math_type=np.complex64): + self.queue = queue + self.array_shape = array.shape[-2:] + self.batches = int(np.prod(array.shape[0:array.ndim-2]) if array.ndim > 2 else 1) + self.batched_multiply_cuda = load_kernel("batched_multiply", { + 'MPY_DO_SCALE': 'true', + 'MPY_DO_FILT': 'true', + 'IN_TYPE': 'float' if array.dtype==np.complex64 else 'double', + 'OUT_TYPE': 'float' if array.dtype==np.complex64 else 'double', + 'MATH_TYPE': 'float' if math_type==np.complex64 else 'double' + }) + self.block = (32,32,1) + self.grid = ( + int((self.array_shape[0] + 31) // 32), + int((self.array_shape[1] + 31) // 32), + int(self.batches) + ) + + def multiply(self, x,y, scale=1.): + assert x.dtype == y.dtype, "Input arrays must be of same data type" + assert x.shape[-2:] == y.shape[-2:], "Input arrays must be of the same size in last 2 dims" + self.batched_multiply_cuda(x,x,y, + np.float32(scale), + np.int32(self.batches), + np.int32(self.array_shape[0]), + np.int32(self.array_shape[1]), + block=self.block, + grid=self.grid, + stream=self.queue) + + class TransposeKernel: def __init__(self, queue=None): @@ -324,6 +357,62 @@ def delxb(self, input, out, axis=-1): ) +class FFTGaussianSmoothingKernel: + def __init__(self, queue=None, kernel_type='float'): + if kernel_type not in ['float', 'double']: + raise ValueError('Invalid data type for kernel') + self.kernel_type = kernel_type + self.dtype = np.complex64 + self.stype = "complex" + self.queue = queue + self.sigma = None + + from .kernels import FFTFilterKernel + + # Create general FFT filter object + self.fft_filter = FFTFilterKernel(queue_thread=queue) + + def allocate(self, shape, sigma=1.): + + # Create kernel + self.sigma = sigma + kernel = self._compute_kernel(shape, sigma) + + # Allocate filter + kernel_dev = gpuarray.to_gpu(kernel) + self.fft_filter.allocate(kernel=kernel_dev) + + def _compute_kernel(self, shape, sigma): + # Create kernel + sigma = np.array(sigma) + if sigma.size == 1: + sigma = np.array([sigma, sigma]) + if sigma.size > 2: + raise NotImplementedError("Only batches of 2D arrays allowed!") + self.sigma = sigma + + kernel = gaussian_kernel_2d(shape, sigma[0], sigma[1]).astype(self.dtype) + + # Extend kernel if needed + if len(shape) == 3: + kernel = np.tile(kernel, (shape[0],1,1)) + + return kernel + + def filter(self, data, sigma=None): + """ + Apply filter in-place + + If sigma is not None: reallocate a new fft filter first. + """ + if self.sigma is None: + self.allocate(shape=data.shape, sigma=sigma) + else: + self.fft_filter.set_kernel(self._compute_kernel(data.shape, sigma)) + + self.fft_filter.apply_filter(data) + + class GaussianSmoothingKernel: def __init__(self, queue=None, num_stdevs=4, kernel_type='float'): if kernel_type not in ['float', 'double']: diff --git a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py index 33910245..d527d9f1 100644 --- a/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py +++ b/ptypy/accelerate/cuda_pycuda/engines/ML_pycuda.py @@ -24,7 +24,7 @@ from .. 
import get_context, get_dev_pool from ..kernels import PropagationKernel, RealSupportKernel, FourierSupportKernel from ..kernels import GradientDescentKernel, AuxiliaryWaveKernel, PoUpdateKernel, PositionCorrectionKernel -from ..array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel, TransposeKernel +from ..array_utils import ArrayUtilsKernel, DerivativesKernel, GaussianSmoothingKernel, FFTGaussianSmoothingKernel, TransposeKernel from ..mem_utils import GpuDataManager from ptypy.accelerate.base import address_manglers @@ -90,8 +90,12 @@ def engine_initialize(self): self.qu_htod = cuda.Stream() self.qu_dtoh = cuda.Stream() - self.GSK = GaussianSmoothingKernel(queue=self.queue) - self.GSK.tmp = None + if self.p.smooth_gradient_method == "convolution": + self.GSK = GaussianSmoothingKernel(queue=self.queue) + self.GSK.tmp = None + + if self.p.smooth_gradient_method == "fft": + self.FGSK = FFTGaussianSmoothingKernel(queue=self.queue) # Real/Fourier Support Kernel self.RSK = {} @@ -255,9 +259,18 @@ def _set_pr_ob_ref_for_data(self, dev='gpu', container=None, sync_copy=False): self._set_pr_ob_ref_for_data(dev=dev, container=container, sync_copy=sync_copy) def _get_smooth_gradient(self, data, sigma): - if self.GSK.tmp is None: - self.GSK.tmp = gpuarray.empty(data.shape, dtype=np.complex64) - self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp) + if self.p.smooth_gradient_method == "convolution": + if self.GSK.tmp is None: + self.GSK.tmp = gpuarray.empty(data.shape, dtype=np.complex64) + try: + self.GSK.convolution(data, [sigma, sigma], tmp=self.GSK.tmp) + except MemoryError: + raise RuntimeError("Convolution kernel too large for direct convolution on GPU", + "Please reduce parameter smooth_gradient or set smooth_gradient_method='fft'.") + elif self.p.smooth_gradient_method == "fft": + self.FGSK.filter(data, sigma) + else: + raise NotImplementedError("smooth_gradient_method should be ```convolution``` or ```fft```.") return data def _replace_ob_grad(self): diff --git a/ptypy/accelerate/cuda_pycuda/fft.py b/ptypy/accelerate/cuda_pycuda/fft.py index 916ed7e5..318e46ce 100644 --- a/ptypy/accelerate/cuda_pycuda/fft.py +++ b/ptypy/accelerate/cuda_pycuda/fft.py @@ -11,7 +11,9 @@ def __init__(self, array, queue=None, post_fft=None, symmetric=True, forward=True): - + """ + array should be gpuarray already + """ self._queue = queue from pycuda import gpuarray ## reikna diff --git a/ptypy/accelerate/cuda_pycuda/kernels.py b/ptypy/accelerate/cuda_pycuda/kernels.py index 8f737871..8fcebb6e 100644 --- a/ptypy/accelerate/cuda_pycuda/kernels.py +++ b/ptypy/accelerate/cuda_pycuda/kernels.py @@ -160,6 +160,34 @@ def allocate(self): def apply_real_support(self, x): x *= self.support +class FFTFilterKernel: + def __init__(self, queue_thread=None, fft='reikna'): + # Current implementation recompiles every time there is a change in input shape. 
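        # With forward=True, apply_filter() computes
        #     x <- postfactor * IFFT( kernel * FFT( prefactor * x ) )
        # the forward transform carries the frequency-space kernel as its
        # post_fft factor, the inverse transform carries the postfactor, and
        # set_kernel() only swaps the kernel array, so changing the smoothing
        # width for a fixed array shape does not trigger a recompile.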
+ self.queue = queue_thread + self._fft_type = fft + self._fft1 = None + self._fft2 = None + def allocate(self, kernel, prefactor=None, postfactor=None, forward=True): + FFT = choose_fft(self._fft_type, kernel.shape[-2:]) + + self._fft1 = FFT(kernel, self.queue, + pre_fft=prefactor, + post_fft=kernel, + symmetric=True, + forward=forward) + self._fft2 = FFT(kernel, self.queue, + post_fft=postfactor, + symmetric=True, + forward=not forward) + + def set_kernel(self, kernel): + self._fft1.post_fft.set(kernel) + + def apply_filter(self, x): + self._fft1.ft(x, x) + self._fft2.ift(x, x) + + class FourierUpdateKernel(ab.FourierUpdateKernel): def __init__(self, aux, nmodes=1, queue_thread=None, accumulate_type='float', math_type='float'): diff --git a/templates/misc/moonflower_ML_cupy_fft_smoothing.py b/templates/misc/moonflower_ML_cupy_fft_smoothing.py new file mode 100644 index 00000000..f127d2c4 --- /dev/null +++ b/templates/misc/moonflower_ML_cupy_fft_smoothing.py @@ -0,0 +1,64 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +from ptypy.core import Ptycho +from ptypy import utils as u +import ptypy +ptypy.load_gpu_engines(arch="cupy") + +import tempfile +tmpdir = tempfile.gettempdir() + +p = u.Param() + +# for verbose output +p.verbose_level = "info" +p.frames_per_block = 400 +# set home path +p.io = u.Param() +p.io.home = "/".join([tmpdir, "ptypy"]) +p.io.autosave = u.Param(active=False) +p.io.autoplot = u.Param(active=False) +p.io.interaction = u.Param(active=False) + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. +p.scans.MF.name = 'BlockFull' +p.scans.MF.data= u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 100 +p.scans.MF.data.save = None + +p.scans.MF.illumination = u.Param(diversity=None) +p.scans.MF.coherence = u.Param(num_probe_modes=1) +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photon in empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0. + +# attach a reconstrucion engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'ML_cupy' +p.engines.engine00.numiter = 300 +p.engines.engine00.numiter_contiguous = 5 +p.engines.engine00.reg_del2 = True # Whether to use a Gaussian prior (smoothing) regularizer +p.engines.engine00.reg_del2_amplitude = 1. # Amplitude of the Gaussian prior if used +p.engines.engine00.scale_precond = True +p.engines.engine00.smooth_gradient = 50. +p.engines.engine00.smooth_gradient_decay = 1/50. +p.engines.engine00.smooth_gradient_method = "fft" # with method "convolution" there can be shared memory issues +p.engines.engine00.floating_intensities = False + +# prepare and run +if __name__ == "__main__": + P = Ptycho(p,level=5) diff --git a/templates/misc/moonflower_ML_pycuda_fft_smoothing.py b/templates/misc/moonflower_ML_pycuda_fft_smoothing.py new file mode 100644 index 00000000..d7ea338a --- /dev/null +++ b/templates/misc/moonflower_ML_pycuda_fft_smoothing.py @@ -0,0 +1,64 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. 
It uses the test Scan class +`ptypy.core.data.MoonFlowerScan` to provide "data". +""" + +from ptypy.core import Ptycho +from ptypy import utils as u +import ptypy +ptypy.load_gpu_engines(arch="pycuda") + +import tempfile +tmpdir = tempfile.gettempdir() + +p = u.Param() + +# for verbose output +p.verbose_level = "info" +p.frames_per_block = 400 +# set home path +p.io = u.Param() +p.io.home = "/".join([tmpdir, "ptypy"]) +p.io.autosave = u.Param(active=False) +p.io.autoplot = u.Param(active=False) +p.io.interaction = u.Param(active=False) + +# max 200 frames (128x128px) of diffraction data +p.scans = u.Param() +p.scans.MF = u.Param() +# now you have to specify which ScanModel to use with scans.XX.name, +# just as you have to give 'name' for engines and PtyScan subclasses. +p.scans.MF.name = 'BlockFull' +p.scans.MF.data= u.Param() +p.scans.MF.data.name = 'MoonFlowerScan' +p.scans.MF.data.shape = 128 +p.scans.MF.data.num_frames = 100 +p.scans.MF.data.save = None + +p.scans.MF.illumination = u.Param(diversity=None) +p.scans.MF.coherence = u.Param(num_probe_modes=1) +# position distance in fraction of illumination frame +p.scans.MF.data.density = 0.2 +# total number of photon in empty beam +p.scans.MF.data.photons = 1e8 +# Gaussian FWHM of possible detector blurring +p.scans.MF.data.psf = 0. + +# attach a reconstrucion engine +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'ML_pycuda' +p.engines.engine00.numiter = 300 +p.engines.engine00.numiter_contiguous = 5 +p.engines.engine00.reg_del2 = True # Whether to use a Gaussian prior (smoothing) regularizer +p.engines.engine00.reg_del2_amplitude = 1. # Amplitude of the Gaussian prior if used +p.engines.engine00.scale_precond = True +p.engines.engine00.smooth_gradient = 50. +p.engines.engine00.smooth_gradient_decay = 1/50. +p.engines.engine00.smooth_gradient_method = "fft" # with method "convolution" there can be shared memory issues +p.engines.engine00.floating_intensities = False + +# prepare and run +if __name__ == "__main__": + P = Ptycho(p,level=5) diff --git a/test/accelerate_tests/base_tests/array_utils_test.py b/test/accelerate_tests/base_tests/array_utils_test.py index 06646b81..364352a5 100644 --- a/test/accelerate_tests/base_tests/array_utils_test.py +++ b/test/accelerate_tests/base_tests/array_utils_test.py @@ -291,6 +291,154 @@ def test_crop_pad_3(self): dtype=np.complex64) np.testing.assert_array_almost_equal(A, exp_A) + def test_fft_filter(self): + data = np.zeros((256, 512), dtype=COMPLEX_TYPE) + data[64:-64,128:-128] = 1 + 1.j + + prefactor = np.zeros_like(data) + prefactor[:,256:] = 1. + postfactor = np.zeros_like(data) + postfactor[128:,:] = 1. + + rk = np.zeros_like(data) + rk[:30, :30] = 1. 
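        # The kernel is supplied in frequency space: with the orthonormal FFTs
        # used by fft_filter, this amounts to a circular convolution of the
        # prefactor-masked data with the 30x30 real-space box, followed by the
        # postfactor mask.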
+ kernel = np.fft.fftn(rk) + + output = au.fft_filter(data, kernel, prefactor, postfactor) + + known_test_output = np.array([-0.00000000e+00+0.00000000e+00j, 0.00000000e+00-0.00000000e+00j, + -0.00000000e+00 + 0.00000000e+00j, 0.00000000e+00 - 0.00000000e+00j, + 0.00000000e+00-0.00000000e+00j, -0.00000000e+00+0.00000000e+00j, + 0.00000000e+00+0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + 0.00000000e+00+0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + 0.00000000e+00+0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + -0.00000000e+00+0.00000000e+00j, -0.00000000e+00+0.00000000e+00j, + -0.00000000e+00+0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + 0.00000000e+00-0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + 0.00000000e+00+0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + 0.00000000e+00-0.00000000e+00j, 0.00000000e+00-0.00000000e+00j, + 0.00000000e+00-0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + 0.00000000e+00+0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + 0.00000000e+00+0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + 0.00000000e+00+0.00000000e+00j, 0.00000000e+00-0.00000000e+00j, + 0.00000000e+00-0.00000000e+00j, 0.00000000e+00-0.00000000e+00j, + 0.00000000e+00-0.00000000e+00j, 8.66422277e-14+4.86768828e-14j, + 7.23113320e-14+2.82331542e-14j, 9.00000000e+02+9.00000000e+02j, + 9.00000000e+02+9.00000000e+02j, 5.10000000e+02+5.10000000e+02j, + 1.41172830e-14+3.62223425e-14j, 2.61684238e-14-4.13866575e-14j, + 2.16691314e-14-1.95102733e-14j, -1.36536942e-13-9.94589021e-14j, + -1.42905371e-13-5.77964697e-14j, -5.00005072e-14+4.08620637e-14j, + 6.38160272e-14+7.61753583e-14j, 3.90000000e+02+3.90000000e+02j, + 9.00000000e+02+9.00000000e+02j, 9.00000000e+02+9.00000000e+02j, + 3.00000000e+01+3.00000000e+01j, 8.63255773e-14+7.08532924e-14j, + 1.80941313e-14-3.85517154e-14j, 7.84277340e-14-1.32008745e-14j, + -6.57025196e-14-1.72739350e-14j, -6.69570857e-15+6.49622898e-14j, + 6.27436466e-15+7.57162569e-14j, 2.01150157e-15+3.65538558e-14j, + 8.70000000e+01+8.70000000e+01j, -1.13686838e-13-1.70530257e-13j, + 0.00000000e+00-2.27373675e-13j, -1.84492121e-14-9.21502853e-14j, + 2.12418687e-14-8.62209232e-14j, 1.20880692e-13+3.86522371e-14j, + 1.03754734e-13+9.19851759e-14j, 5.50926123e-14+1.17150422e-13j, + -5.47869215e-14+5.87176511e-14j, -3.52652980e-14+8.44455504e-15j]) + + np.testing.assert_array_almost_equal(output.flat[::2000], known_test_output) + + def test_fft_filter_batched(self): + data = np.zeros((2,256, 512), dtype=COMPLEX_TYPE) + data[:,64:-64,128:-128] = 1 + 1.j + + prefactor = np.zeros_like(data) + prefactor[:,:,256:] = 1. + postfactor = np.zeros_like(data) + postfactor[:,128:,:] = 1. + + rk = np.zeros_like(data)[0] + rk[:30, :30] = 1. 
+ kernel = np.fft.fftn(rk) + + output = au.fft_filter(data, kernel, prefactor, postfactor) + + known_test_output = np.array([-0.00000000e+00+0.00000000e+00j, 0.00000000e+00-0.00000000e+00j, + -0.00000000e+00 + 0.00000000e+00j, 0.00000000e+00 - 0.00000000e+00j, + 0.00000000e+00-0.00000000e+00j, -0.00000000e+00+0.00000000e+00j, + 0.00000000e+00+0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + 0.00000000e+00+0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + 0.00000000e+00+0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + -0.00000000e+00+0.00000000e+00j, -0.00000000e+00+0.00000000e+00j, + -0.00000000e+00+0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + 0.00000000e+00-0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + 0.00000000e+00+0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + 0.00000000e+00-0.00000000e+00j, 0.00000000e+00-0.00000000e+00j, + 0.00000000e+00-0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + 0.00000000e+00+0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + 0.00000000e+00+0.00000000e+00j, 0.00000000e+00+0.00000000e+00j, + 0.00000000e+00+0.00000000e+00j, 0.00000000e+00-0.00000000e+00j, + 0.00000000e+00-0.00000000e+00j, 0.00000000e+00-0.00000000e+00j, + 0.00000000e+00-0.00000000e+00j, 8.66422277e-14+4.86768828e-14j, + 7.23113320e-14+2.82331542e-14j, 9.00000000e+02+9.00000000e+02j, + 9.00000000e+02+9.00000000e+02j, 5.10000000e+02+5.10000000e+02j, + 1.41172830e-14+3.62223425e-14j, 2.61684238e-14-4.13866575e-14j, + 2.16691314e-14-1.95102733e-14j, -1.36536942e-13-9.94589021e-14j, + -1.42905371e-13-5.77964697e-14j, -5.00005072e-14+4.08620637e-14j, + 6.38160272e-14+7.61753583e-14j, 3.90000000e+02+3.90000000e+02j, + 9.00000000e+02+9.00000000e+02j, 9.00000000e+02+9.00000000e+02j, + 3.00000000e+01+3.00000000e+01j, 8.63255773e-14+7.08532924e-14j, + 1.80941313e-14-3.85517154e-14j, 7.84277340e-14-1.32008745e-14j, + -6.57025196e-14-1.72739350e-14j, -6.69570857e-15+6.49622898e-14j, + 6.27436466e-15+7.57162569e-14j, 2.01150157e-15+3.65538558e-14j, + 8.70000000e+01+8.70000000e+01j, -1.13686838e-13-1.70530257e-13j, + 0.00000000e+00-2.27373675e-13j, -1.84492121e-14-9.21502853e-14j, + 2.12418687e-14-8.62209232e-14j, 1.20880692e-13+3.86522371e-14j, + 1.03754734e-13+9.19851759e-14j, 5.50926123e-14+1.17150422e-13j, + -5.47869215e-14+5.87176511e-14j, -3.52652980e-14+8.44455504e-15j]) + + np.testing.assert_array_almost_equal(output[1].flat[::2000], known_test_output) + + + def test_complex_gaussian_filter_fft(self): + data = np.zeros((8, 8), dtype=COMPLEX_TYPE) + data[3:5, 3:5] = 2.0 + 2.0j + mfs = 3.0, 4.0 + + out = au.complex_gaussian_filter_fft(data, mfs) + expected_out = np.array([0.11033735 + 0.11033735j, 0.11888228 + 0.11888228j, 0.13116673 + 0.13116673j + , 0.13999543 + 0.13999543j, 0.13999543 + 0.13999543j, 0.13116673 + 0.13116673j + , 0.11888228 + 0.11888228j, 0.11033735 + 0.11033735j], dtype=COMPLEX_TYPE) + np.testing.assert_array_almost_equal(np.diagonal(out), expected_out, decimal=5) + + def test_complex_gaussian_filter_fft_batched(self): + batch_number = 2 + A = 5 + B = 5 + + data = np.zeros((batch_number, A, B), dtype=COMPLEX_TYPE) + data[:, 2:3, 2:3] = 2.0 + 2.0j + mfs = 3.0, 4.0 + out = au.complex_gaussian_filter_fft(data, mfs) + + expected_out = np.array([[[0.07988770 + 0.0798877j, 0.07989411 + 0.07989411j, 0.07989471 + 0.07989471j, + 0.07989411 + 0.07989411j, 0.07988770 + 0.0798877j], + [0.08003781 + 0.08003781j, 0.08004424 + 0.08004424j, 0.08004485 + 0.08004485j, + 0.08004424 + 0.08004424j, 0.08003781 + 0.08003781j], + [0.08012911 + 0.08012911j, 0.08013555 + 0.08013555j, 
0.08013615 + 0.08013615j, + 0.08013555 + 0.08013555j, 0.08012911 + 0.08012911j], + [0.08003781 + 0.08003781j, 0.08004424 + 0.08004424j, 0.08004485 + 0.08004485j, + 0.08004424 + 0.08004424j, 0.08003781 + 0.08003781j], + [0.07988770 + 0.0798877j, 0.07989411 + 0.07989411j, 0.07989471 + 0.07989471j, + 0.07989411 + 0.07989411j, 0.07988770 + 0.0798877j]], + + [[0.07988770 + 0.0798877j, 0.07989411 + 0.07989411j, 0.07989471 + 0.07989471j, + 0.07989411 + 0.07989411j, 0.07988770 + 0.0798877j], + [0.08003781 + 0.08003781j, 0.08004424 + 0.08004424j, 0.08004485 + 0.08004485j, + 0.08004424 + 0.08004424j, 0.08003781 + 0.08003781j], + [0.08012911 + 0.08012911j, 0.08013555 + 0.08013555j, 0.08013615 + 0.08013615j, + 0.08013555 + 0.08013555j, 0.08012911 + 0.08012911j], + [0.08003781 + 0.08003781j, 0.08004424 + 0.08004424j, 0.08004485 + 0.08004485j, + 0.08004424 + 0.08004424j, 0.08003781 + 0.08003781j], + [0.07988770 + 0.0798877j, 0.07989411 + 0.07989411j, 0.07989471 + 0.07989471j, + 0.07989411 + 0.07989411j, 0.07988770 + 0.0798877j]]], dtype=COMPLEX_TYPE) + + np.testing.assert_array_almost_equal(out, expected_out, decimal=5) + if __name__ == '__main__': unittest.main() diff --git a/test/accelerate_tests/cuda_cupy_tests/array_utils_test.py b/test/accelerate_tests/cuda_cupy_tests/array_utils_test.py index 0c018b20..bc245ca7 100644 --- a/test/accelerate_tests/cuda_cupy_tests/array_utils_test.py +++ b/test/accelerate_tests/cuda_cupy_tests/array_utils_test.py @@ -6,6 +6,7 @@ if have_cupy(): import cupy as cp import ptypy.accelerate.cuda_cupy.array_utils as gau + from ptypy.accelerate.cuda_cupy.kernels import FFTFilterKernel class ArrayUtilsTest(CupyCudaTest): @@ -78,6 +79,23 @@ def test_dot_performance(self): AU = gau.ArrayUtilsKernel(acc_dtype=np.float64) AU.dot(A_dev, A_dev) + def test_batched_multiply(self): + # Arrange + sh = (3,14,24) + ksh = (14,24) + data = (np.random.random(sh) + 1j* np.random.random(sh)).astype(np.complex64) + kernel = (np.random.random(ksh) + 1j* np.random.random(ksh)).astype(np.complex64) + data_dev = cp.asarray(data) + kernel_dev = cp.asarray(kernel) + + # Act + BM = gau.BatchedMultiplyKernel(data_dev) + BM.multiply(data_dev, kernel_dev, scale=2.) + + # Assert + expected = data * kernel * 2. + np.testing.assert_array_almost_equal(data_dev.get(), expected) + def test_transpose_2D(self): # Arrange inp, _ = np.indices((5, 3), dtype=np.int32) @@ -534,3 +552,126 @@ def test_interpolate_shift_no_shift_UNITY(self): np.testing.assert_allclose(out, isk, rtol=1e-6, atol=1e-6, err_msg="The shifting of array has not been calculated as expected") + def test_fft_filter_UNITY(self): + sh = (32, 48) + data = np.zeros(sh, dtype=np.complex64) + data.flat[:] = np.arange(np.prod(sh)) + kernel = np.zeros_like(data) + kernel[0, 0] = 1. + kernel[0, 1] = 0.5 + + prefactor = np.zeros_like(data) + prefactor[:,2:] = 1. + postfactor = np.zeros_like(data) + postfactor[2:,:] = 1. + + data_dev = cp.asarray(data) + kernel_dev = cp.asarray(kernel) + pre_dev = cp.asarray(prefactor) + post_dev = cp.asarray(postfactor) + + FF = FFTFilterKernel(queue_thread=self.stream) + FF.allocate(kernel=kernel_dev, prefactor=pre_dev, postfactor=post_dev) + FF.apply_filter(data_dev) + + output = au.fft_filter(data, kernel, prefactor, postfactor) + + np.testing.assert_allclose(output, data_dev.get(), rtol=1e-5, atol=1e-6) + + def test_fft_filter_batched_UNITY(self): + sh = (2,16, 35) + data = np.zeros(sh, dtype=np.complex64) + data.flat[:] = np.arange(np.prod(sh)) + kernel = np.zeros_like(data) + kernel[:,0, 0] = 1. 
+ kernel[:,0, 1] = 0.5 + + prefactor = np.zeros_like(data) + prefactor[:,:,2:] = 1. + postfactor = np.zeros_like(data) + postfactor[:,2:,:] = 1. + + data_dev = cp.asarray(data) + kernel_dev = cp.asarray(kernel) + pre_dev = cp.asarray(prefactor) + post_dev = cp.asarray(postfactor) + + FF = FFTFilterKernel(queue_thread=self.stream) + FF.allocate(kernel=kernel_dev, prefactor=pre_dev, postfactor=post_dev) + FF.apply_filter(data_dev) + + output = au.fft_filter(data, kernel, prefactor, postfactor) + np.testing.assert_allclose(output, data_dev.get(), rtol=1e-5, atol=1e-6) + + def test_complex_gaussian_filter_fft_little_blurring_UNITY(self): + # Arrange + data = np.zeros((21, 21), dtype=np.complex64) + data[10:12, 10:12] = 2.0+2.0j + mfs = 0.2,0.2 + data_dev = cp.asarray(data) + + # Act + FGSK = gau.FFTGaussianSmoothingKernel(queue=self.stream) + FGSK.filter(data_dev, mfs) + + # Assert + out_exp = au.complex_gaussian_filter_fft(data, mfs) + out = data_dev.get() + + np.testing.assert_allclose(out_exp, out, atol=1e-6) + + def test_complex_gaussian_filter_fft_more_blurring_UNITY(self): + # Arrange + data = np.zeros((8, 8), dtype=np.complex64) + data[3:5, 3:5] = 2.0+2.0j + mfs = 3.0,4.0 + data_dev = cp.asarray(data) + + # Act + FGSK = gau.FFTGaussianSmoothingKernel(queue=self.stream) + FGSK.filter(data_dev, mfs) + + # Assert + out_exp = au.complex_gaussian_filter_fft(data, mfs) + out = data_dev.get() + + np.testing.assert_allclose(out_exp, out, atol=1e-6) + + def test_complex_gaussian_filter_fft_nonsquare_UNITY(self): + # Arrange + data = np.zeros((32, 16), dtype=np.complex64) + data[3:4, 11:12] = 2.0+2.0j + data[3:5, 3:5] = 2.0+2.0j + data[20:25,3:5] = 2.0+2.0j + mfs = 1.0,1.0 + data_dev = cp.asarray(data) + + # Act + FGSK = gau.FFTGaussianSmoothingKernel(queue=self.stream) + FGSK.filter(data_dev, mfs) + + # Assert + out_exp = au.complex_gaussian_filter_fft(data, mfs) + out = data_dev.get() + + np.testing.assert_allclose(out_exp, out, atol=1e-6) + + def test_complex_gaussian_filter_fft_batched_UNITY(self): + # Arrange + batch_number = 2 + A = 5 + B = 5 + data = np.zeros((batch_number, A, B), dtype=np.complex64) + data[:, 2:3, 2:3] = 2.0+2.0j + mfs = 3.0,4.0 + data_dev = cp.asarray(data) + + # Act + FGSK = gau.FFTGaussianSmoothingKernel(queue=self.stream) + FGSK.filter(data_dev, mfs) + + # Assert + out_exp = au.complex_gaussian_filter_fft(data, mfs) + out = data_dev.get() + + np.testing.assert_allclose(out_exp, out, atol=1e-6) \ No newline at end of file diff --git a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py index 912b68bf..be82cc29 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/array_utils_test.py @@ -11,6 +11,7 @@ if have_pycuda(): from pycuda import gpuarray import ptypy.accelerate.cuda_pycuda.array_utils as gau + from ptypy.accelerate.cuda_pycuda.kernels import FFTFilterKernel class ArrayUtilsTest(PyCudaTest): @@ -81,6 +82,23 @@ def test_dot_performance(self): AU = gau.ArrayUtilsKernel(acc_dtype=np.float64) out_dev = AU.dot(A_dev, A_dev) + def test_batched_multiply(self): + # Arrange + sh = (3,14,24) + ksh = (14,24) + data = (np.random.random(sh) + 1j* np.random.random(sh)).astype(np.complex64) + kernel = (np.random.random(ksh) + 1j* np.random.random(ksh)).astype(np.complex64) + data_dev = gpuarray.to_gpu(data) + kernel_dev = gpuarray.to_gpu(kernel) + + # Act + BM = gau.BatchedMultiplyKernel(data_dev) + BM.multiply(data_dev, kernel_dev, scale=2.) 
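        # multiply() writes in place: data_dev now holds data * kernel * scale,
        # with the 2D kernel broadcast over the batch dimension.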
+ + # Assert + expected = data * kernel * 2. + np.testing.assert_array_almost_equal(data_dev.get(), expected) + def test_transpose_2D(self): ## Arrange inp,_ = np.indices((5,3), dtype=np.int32) @@ -538,3 +556,128 @@ def test_interpolate_shift_no_shift_UNITY(self): np.testing.assert_allclose(out, isk, rtol=1e-6, atol=1e-6, err_msg="The shifting of array has not been calculated as expected") + def test_fft_filter_UNITY(self): + sh = (16, 35) + data = np.zeros(sh, dtype=np.complex64) + data.flat[:] = np.arange(np.prod(sh)) + kernel = np.zeros_like(data) + kernel[0, 0] = 1. + kernel[0, 1] = 0.5 + + prefactor = np.zeros_like(data) + prefactor[:,2:] = 1. + postfactor = np.zeros_like(data) + postfactor[2:,:] = 1. + + data_dev = gpuarray.to_gpu(data) + kernel_dev = gpuarray.to_gpu(kernel) + pre_dev = gpuarray.to_gpu(prefactor) + post_dev = gpuarray.to_gpu(postfactor) + + FF = FFTFilterKernel(queue_thread=self.stream) + FF.allocate(kernel=kernel_dev, prefactor=pre_dev, postfactor=post_dev) + FF.apply_filter(data_dev) + + output = au.fft_filter(data, kernel, prefactor, postfactor) + + np.testing.assert_allclose(output, data_dev.get(), rtol=1e-5, atol=1e-6) + + def test_fft_filter_batched_UNITY(self): + sh = (2,16, 35) + data = np.zeros(sh, dtype=np.complex64) + data.flat[:] = np.arange(np.prod(sh)) + kernel = np.zeros_like(data) + kernel[:,0, 0] = 1. + kernel[:,0, 1] = 0.5 + + prefactor = np.zeros_like(data) + prefactor[:,:,2:] = 1. + postfactor = np.zeros_like(data) + postfactor[:,2:,:] = 1. + + data_dev = gpuarray.to_gpu(data) + kernel_dev = gpuarray.to_gpu(kernel) + pre_dev = gpuarray.to_gpu(prefactor) + post_dev = gpuarray.to_gpu(postfactor) + + FF = FFTFilterKernel(queue_thread=self.stream) + FF.allocate(kernel=kernel_dev, prefactor=pre_dev, postfactor=post_dev) + FF.apply_filter(data_dev) + + output = au.fft_filter(data, kernel, prefactor, postfactor) + print(data_dev.get()) + + np.testing.assert_allclose(output, data_dev.get(), rtol=1e-5, atol=1e-6) + + def test_complex_gaussian_filter_fft_little_blurring_UNITY(self): + # Arrange + data = np.zeros((21, 21), dtype=np.complex64) + data[10:12, 10:12] = 2.0+2.0j + mfs = 0.2,0.2 + data_dev = gpuarray.to_gpu(data) + + # Act + FGSK = gau.FFTGaussianSmoothingKernel(queue=self.stream) + FGSK.filter(data_dev, mfs) + + # Assert + out_exp = au.complex_gaussian_filter_fft(data, mfs) + out = data_dev.get() + + np.testing.assert_allclose(out_exp, out, atol=1e-6) + + def test_complex_gaussian_filter_fft_more_blurring_UNITY(self): + # Arrange + data = np.zeros((8, 8), dtype=np.complex64) + data[3:5, 3:5] = 2.0+2.0j + mfs = 3.0,4.0 + data_dev = gpuarray.to_gpu(data) + + # Act + FGSK = gau.FFTGaussianSmoothingKernel(queue=self.stream) + FGSK.filter(data_dev, mfs) + + # Assert + out_exp = au.complex_gaussian_filter_fft(data, mfs) + out = data_dev.get() + + np.testing.assert_allclose(out_exp, out, atol=1e-6) + + def test_complex_gaussian_filter_fft_nonsquare_UNITY(self): + # Arrange + data = np.zeros((32, 16), dtype=np.complex64) + data[3:4, 11:12] = 2.0+2.0j + data[3:5, 3:5] = 2.0+2.0j + data[20:25,3:5] = 2.0+2.0j + mfs = 1.0,1.0 + data_dev = gpuarray.to_gpu(data) + + # Act + FGSK = gau.FFTGaussianSmoothingKernel(queue=self.stream) + FGSK.filter(data_dev, mfs) + + # Assert + out_exp = au.complex_gaussian_filter_fft(data, mfs) + out = data_dev.get() + + np.testing.assert_allclose(out_exp, out, atol=1e-6) + + def test_complex_gaussian_filter_fft_batched(self): + # Arrange + batch_number = 2 + A = 5 + B = 5 + data = np.zeros((batch_number, A, B), 
dtype=np.complex64) + data[:, 2:3, 2:3] = 2.0+2.0j + mfs = 3.0,4.0 + data_dev = gpuarray.to_gpu(data) + + # Act + FGSK = gau.FFTGaussianSmoothingKernel(queue=self.stream) + FGSK.filter(data_dev, mfs) + + # Assert + out_exp = au.complex_gaussian_filter_fft(data, mfs) + out = data_dev.get() + + np.testing.assert_allclose(out_exp, out, atol=1e-6) \ No newline at end of file diff --git a/test/accelerate_tests/cuda_pycuda_tests/fft_setstream_test.py b/test/accelerate_tests/cuda_pycuda_tests/fft_setstream_test.py index 5816e3bf..c30c08e9 100644 --- a/test/accelerate_tests/cuda_pycuda_tests/fft_setstream_test.py +++ b/test/accelerate_tests/cuda_pycuda_tests/fft_setstream_test.py @@ -95,5 +95,6 @@ def test_set_stream_a_reikna(self): def test_set_stream_b_cufft(self): self.helper(cuFFT) + @unittest.skip("Skcuda is currently broken") def test_set_stream_c_skcuda_cufft(self): self.helper(SkcudaCuFFT) From a1bde3e6137c50469f7e65162f67fa6a19a74e9f Mon Sep 17 00:00:00 2001 From: "Benedikt J. Daurer" Date: Fri, 8 Mar 2024 17:55:44 +0000 Subject: [PATCH 12/14] add more accelerated cupy templates (#547) --- ptypy/utils/scripts.py | 2 +- .../ptypy_i13_AuStar_farfield_cupy.py | 116 +++++++++++++++++ .../ptypy_i13_AuStar_nearfield_cupy.py | 118 +++++++++++++++++ .../ptypy_id22ni_AuStar_focused_cupy.py | 120 ++++++++++++++++++ .../ptypy_laser_logo_focused_cupy.py | 100 +++++++++++++++ .../ptypy_laser_logo_focused_pycuda.py | 2 +- templates/ptypy_laser_logo_focused.py | 2 +- 7 files changed, 457 insertions(+), 3 deletions(-) create mode 100644 templates/accelerate/ptypy_i13_AuStar_farfield_cupy.py create mode 100644 templates/accelerate/ptypy_i13_AuStar_nearfield_cupy.py create mode 100644 templates/accelerate/ptypy_id22ni_AuStar_focused_cupy.py create mode 100644 templates/accelerate/ptypy_laser_logo_focused_cupy.py diff --git a/ptypy/utils/scripts.py b/ptypy/utils/scripts.py index 97e3248b..f438bd22 100644 --- a/ptypy/utils/scripts.py +++ b/ptypy/utils/scripts.py @@ -728,7 +728,7 @@ def phase_from_dpc(dpc_row, dpc_col): sh = np.asarray(sh) fac = np.ones_like(sh) fac[-2:] = 2 - f = np.zeros(sh * fac, dtype=np.complex) + f = np.zeros(sh * fac, dtype=complex) c = px + 1j*py f[..., :sh[-2], :sh[-1]] = c f[..., :sh[-2], sh[-1]:] = c[..., :, ::-1] diff --git a/templates/accelerate/ptypy_i13_AuStar_farfield_cupy.py b/templates/accelerate/ptypy_i13_AuStar_farfield_cupy.py new file mode 100644 index 00000000..68800f6b --- /dev/null +++ b/templates/accelerate/ptypy_i13_AuStar_farfield_cupy.py @@ -0,0 +1,116 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses a simulated Au Siemens star pattern under +experimental farfield conditions in the hard X-ray regime. 
+""" +from ptypy.core import Ptycho +from ptypy import utils as u +import ptypy +ptypy.load_gpu_engines(arch="cupy") + +import tempfile +tmpdir = tempfile.gettempdir() + +### PTYCHO PARAMETERS +p = u.Param() +p.verbose_level = "info" +p.run = None + +p.data_type = "single" +p.run = None +p.io = u.Param() +p.io.home = "/".join([tmpdir, "ptypy"]) +p.io.autosave = u.Param(active=False) +p.io.autoplot = u.Param(active=False, layout="nearfield") +p.io.interaction = u.Param(active=False) + +# Simulation parameters +sim = u.Param() +sim.energy = 9.7 +sim.distance = 8.46e-2 +sim.psize = 100e-9 +sim.shape = 1024 +sim.xy = u.Param() +sim.xy.override = u.parallel.MPIrand_uniform(0.0,10e-6,(20,2)) +#sim.xy.positions = np.random.normal(0.0,3e-6,(20,2)) +sim.verbose_level = 1 + +sim.illumination = u.Param() +sim.illumination.model = None +sim.illumination.photons = 1e11 +sim.illumination.aperture = u.Param() +sim.illumination.aperture.diffuser = (8.0, 10.0) +sim.illumination.aperture.form = "circ" +sim.illumination.aperture.size = 90e-6 +sim.illumination.aperture.central_stop = 0.15 +sim.illumination.propagation = u.Param() +sim.illumination.propagation.focussed = None#0.08 +sim.illumination.propagation.parallel = 0.005 +sim.illumination.propagation.spot_size = None + +sim.sample = u.Param() +sim.sample.model = u.xradia_star((1200,1200),minfeature=3,contrast=0.8) +sim.sample.process = u.Param() +sim.sample.process.offset = (0,0) +sim.sample.process.zoom = 1.0 +sim.sample.process.formula = "Au" +sim.sample.process.density = 19.3 +sim.sample.process.thickness = 700e-9 +sim.sample.process.ref_index = None +sim.sample.process.smoothing = None +sim.sample.fill = 1.0+0.j + +sim.detector = 'GenericCCD32bit' +sim.plot = False + +# Scan model and initial value parameters +p.scans = u.Param() +p.scans.scan00 = u.Param() +p.scans.scan00.name = 'BlockFull' + +p.scans.scan00.coherence = u.Param() +p.scans.scan00.coherence.num_probe_modes = 1 +p.scans.scan00.coherence.num_object_modes = 1 +p.scans.scan00.coherence.energies = [1.0] + +p.scans.scan00.sample = u.Param() + +# (copy simulation illumination and modify some things) +p.scans.scan00.illumination = sim.illumination.copy(99) +p.scans.scan00.illumination.aperture.size = 105e-6 +p.scans.scan00.illumination.aperture.central_stop = None + +# Scan data (simulation) parameters +p.scans.scan00.data=u.Param() +p.scans.scan00.data.name = 'SimScan' +p.scans.scan00.data.propagation = 'nearfield' +p.scans.scan00.data.save = None #'append' +p.scans.scan00.data.shape = None +p.scans.scan00.data.num_frames = None +p.scans.scan00.data.update(sim) + +# Reconstruction parameters +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DM_cupy' +p.engines.engine00.numiter = 100 +p.engines.engine00.object_inertia = 1. 
+p.engines.engine00.numiter_contiguous = 1 +p.engines.engine00.probe_support = None +p.engines.engine00.probe_inertia = 0.001 +p.engines.engine00.obj_smooth_std = 10 +p.engines.engine00.clip_object = None +p.engines.engine00.alpha = 1 +p.engines.engine00.probe_update_start = 2 +p.engines.engine00.update_object_first = True +p.engines.engine00.overlap_converge_factor = 0.5 +p.engines.engine00.overlap_max_iterations = 100 +p.engines.engine00.fourier_relax_factor = 0.05 + +p.engines.engine01 = u.Param() +p.engines.engine01.name = 'ML_cupy' +p.engines.engine01.numiter = 50 + +if __name__ == "__main__": + P = Ptycho(p,level=5) + diff --git a/templates/accelerate/ptypy_i13_AuStar_nearfield_cupy.py b/templates/accelerate/ptypy_i13_AuStar_nearfield_cupy.py new file mode 100644 index 00000000..9e92fec1 --- /dev/null +++ b/templates/accelerate/ptypy_i13_AuStar_nearfield_cupy.py @@ -0,0 +1,118 @@ +""" +This script is a test for ptychographic reconstruction in the absence +of actual data. It uses a simulated Au Siemens star pattern under +experimental nearfield conditions in the hard X-ray regime. +""" +from ptypy.core import Ptycho +from ptypy import utils as u +import ptypy +ptypy.load_gpu_engines(arch="cupy") + +import tempfile +tmpdir = tempfile.gettempdir() + +### PTYCHO PARAMETERS +p = u.Param() +p.verbose_level = "info" +p.run = None +p.frames_per_block = 20 + +p.data_type = "single" +p.run = None +p.io = u.Param() +p.io.home = "/".join([tmpdir, "ptypy"]) +p.io.autosave = u.Param(active=False) +p.io.autoplot = u.Param(active=False, layout="nearfield") +p.io.interaction = u.Param(active=False) + +# Simulation parameters +sim = u.Param() +sim.energy = 9.7 +sim.distance = 8.46e-2 +sim.psize = 100e-9 +sim.shape = 1024 +sim.xy = u.Param() +sim.xy.override = u.parallel.MPIrand_uniform(0.0,10e-6,(20,2)) +#sim.xy.positions = np.random.normal(0.0,3e-6,(20,2)) +sim.verbose_level = 1 + +sim.illumination = u.Param() +sim.illumination.model = None +sim.illumination.photons = 1e11 +sim.illumination.aperture = u.Param() +sim.illumination.aperture.diffuser = (8.0, 10.0) +sim.illumination.aperture.form = "circ" +sim.illumination.aperture.size = 90e-6 +sim.illumination.aperture.central_stop = 0.15 +sim.illumination.propagation = u.Param() +sim.illumination.propagation.focussed = None#0.08 +sim.illumination.propagation.parallel = 0.005 +sim.illumination.propagation.spot_size = None + +sim.sample = u.Param() +sim.sample.model = u.xradia_star((1200,1200),minfeature=3,contrast=0.8) +sim.sample.process = u.Param() +sim.sample.process.offset = (0,0) +sim.sample.process.zoom = 1.0 +sim.sample.process.formula = "Au" +sim.sample.process.density = 19.3 +sim.sample.process.thickness = 700e-9 +sim.sample.process.ref_index = None +sim.sample.process.smoothing = None +sim.sample.fill = 1.0+0.j + +sim.detector = 'GenericCCD32bit' +sim.plot = False + +# Scan model and initial value parameters +p.scans = u.Param() +p.scans.scan00 = u.Param() +p.scans.scan00.name = 'BlockFull' +p.scans.scan00.propagation = "nearfield" + +p.scans.scan00.coherence = u.Param() +p.scans.scan00.coherence.num_probe_modes = 1 +p.scans.scan00.coherence.num_object_modes = 1 +p.scans.scan00.coherence.energies = [1.0] + +p.scans.scan00.sample = u.Param() + +# (copy simulation illumination and modify some things) +p.scans.scan00.illumination = sim.illumination.copy(99) +p.scans.scan00.illumination.aperture.size = 105e-6 +p.scans.scan00.illumination.aperture.central_stop = None + +# Scan data (simulation) parameters +p.scans.scan00.data=u.Param() 
+p.scans.scan00.data.name = 'SimScan'
+p.scans.scan00.data.propagation = 'nearfield'
+p.scans.scan00.data.save = None #'append'
+p.scans.scan00.data.shape = None
+p.scans.scan00.data.num_frames = None
+p.scans.scan00.data.update(sim)
+
+# Reconstruction parameters
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'DM_cupy'
+p.engines.engine00.numiter = 100
+p.engines.engine00.object_inertia = 1.
+p.engines.engine00.numiter_contiguous = 1
+p.engines.engine00.probe_support = None
+p.engines.engine00.probe_inertia = 0.001
+p.engines.engine00.obj_smooth_std = 10
+p.engines.engine00.clip_object = None
+p.engines.engine00.alpha = 1
+p.engines.engine00.probe_update_start = 2
+p.engines.engine00.update_object_first = True
+p.engines.engine00.overlap_converge_factor = 0.5
+p.engines.engine00.overlap_max_iterations = 100
+p.engines.engine00.fourier_relax_factor = 0.05
+
+p.engines.engine01 = u.Param()
+p.engines.engine01.name = 'ML_cupy'
+p.engines.engine01.numiter = 50
+
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
+
diff --git a/templates/accelerate/ptypy_id22ni_AuStar_focused_cupy.py b/templates/accelerate/ptypy_id22ni_AuStar_focused_cupy.py
new file mode 100644
index 00000000..2a4b31c4
--- /dev/null
+++ b/templates/accelerate/ptypy_id22ni_AuStar_focused_cupy.py
@@ -0,0 +1,120 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses a simulated Au Siemens star pattern under
+experimental farfield conditions and with a focused beam in the hard X-ray regime.
+"""
+from ptypy.core import Ptycho
+from ptypy import utils as u
+import ptypy
+ptypy.load_gpu_engines(arch="cupy")
+
+import numpy as np
+import tempfile
+tmpdir = tempfile.gettempdir()
+
+p = u.Param()
+
+### PTYCHO PARAMETERS
+p.verbose_level = "info"
+
+p.data_type = "single"
+p.run = None
+
+p.io = u.Param()
+p.io.home = "/".join([tmpdir, "ptypy"])
+p.io.autosave = u.Param(active=False)
+p.io.autoplot = u.Param(active=False, layout="weak")
+p.io.interaction = u.Param(active=False)
+
+# Simulation parameters
+sim = u.Param()
+sim.energy = 17.0
+sim.distance = 2.886
+sim.psize = 51e-6
+sim.shape = 256
+sim.xy = u.Param()
+sim.xy.model = "round"
+sim.xy.spacing = 250e-9
+sim.xy.steps = 30
+sim.xy.extent = 4e-6
+
+sim.illumination = u.Param()
+sim.illumination.model = None
+sim.illumination.photons = 3e8
+sim.illumination.aperture = u.Param()
+sim.illumination.aperture.diffuser = None
+sim.illumination.aperture.form = "rect"
+sim.illumination.aperture.size = 35e-6
+sim.illumination.aperture.central_stop = None
+sim.illumination.propagation = u.Param()
+sim.illumination.propagation.focussed = 0.08
+sim.illumination.propagation.parallel = 0.0014
+sim.illumination.propagation.spot_size = None
+
+sim.sample = u.Param()
+sim.sample.model = u.xradia_star((1000,1000),minfeature=3,contrast=0.0)
+sim.sample.process = u.Param()
+sim.sample.process.offset = (100,100)
+sim.sample.process.zoom = 1.0
+sim.sample.process.formula = "Au"
+sim.sample.process.density = 19.3
+sim.sample.process.thickness = 2000e-9
+sim.sample.process.ref_index = None
+sim.sample.process.smoothing = None
+sim.sample.fill = 1.0+0.j
+
+#sim.detector = 'FRELON_TAPER'
+sim.detector = 'GenericCCD32bit'
+sim.verbose_level = 1
+sim.psf = 1. # emulates partial coherence
+sim.plot = False
+
+# Scan model and initial value parameters
+p.scans = u.Param()
+p.scans.scan00 = u.Param()
+p.scans.scan00.name = 'BlockFull'
+
+p.scans.scan00.coherence = u.Param()
+p.scans.scan00.coherence.num_probe_modes = 4
+p.scans.scan00.coherence.num_object_modes = 1
+p.scans.scan00.coherence.energies = [1.0]
+
+p.scans.scan00.sample = u.Param()
+p.scans.scan00.sample.model = 'stxm'
+p.scans.scan00.sample.process = None
+
+# (copy the simulation illumination and change specific things)
+p.scans.scan00.illumination = sim.illumination.copy(99)
+p.scans.scan00.illumination.aperture.form = 'circ'
+p.scans.scan00.illumination.propagation.focussed = 0.06
+p.scans.scan00.illumination.diversity = u.Param()
+p.scans.scan00.illumination.diversity.power = 0.1
+p.scans.scan00.illumination.diversity.noise = (np.pi,3.0)
+
+# Scan data (simulation) parameters
+p.scans.scan00.data = u.Param()
+p.scans.scan00.data.name = 'SimScan'
+p.scans.scan00.data.update(sim)
+p.scans.scan00.data.save = None
+
+# Reconstruction parameters
+p.engines = u.Param()
+p.engines.engine00 = u.Param()
+p.engines.engine00.name = 'DM_cupy'
+p.engines.engine00.numiter = 150
+p.engines.engine00.fourier_relax_factor = 0.05
+p.engines.engine00.numiter_contiguous = 1
+p.engines.engine00.probe_support = 0.7
+p.engines.engine00.probe_inertia = 0.01
+p.engines.engine00.object_inertia = 0.1
+p.engines.engine00.clip_object = (0, 1.)
+p.engines.engine00.alpha = 1
+p.engines.engine00.probe_update_start = 2
+p.engines.engine00.update_object_first = True
+p.engines.engine00.overlap_converge_factor = 0.05
+p.engines.engine00.overlap_max_iterations = 100
+p.engines.engine00.obj_smooth_std = 5
+
+u.verbose.set_level("info")
+if __name__ == "__main__":
+    P = Ptycho(p,level=5)
diff --git a/templates/accelerate/ptypy_laser_logo_focused_cupy.py b/templates/accelerate/ptypy_laser_logo_focused_cupy.py
new file mode 100644
index 00000000..05d999a3
--- /dev/null
+++ b/templates/accelerate/ptypy_laser_logo_focused_cupy.py
@@ -0,0 +1,100 @@
+"""
+This script is a test for ptychographic reconstruction in the absence
+of actual data. It uses a simulated Au Siemens star pattern under
+experimental farfield conditions and with a focused optical laser beam.
+""" +from ptypy.core import Ptycho +from ptypy import utils as u +import ptypy.simulations as sim +import ptypy +ptypy.load_gpu_engines(arch="cupy") + +import pathlib +import numpy as np +import tempfile +tmpdir = tempfile.gettempdir() + +### PTYCHO PARAMETERS +p = u.Param() +p.verbose_level = "info" +p.data_type = "single" + +p.run = None +p.io = u.Param() +p.io.home = "/".join([tmpdir, "ptypy"]) +p.io.autosave = u.Param(active=False) +p.io.autoplot = u.Param(active=False, layout="minimal") +p.io.interaction = u.Param(active=False) + +# Simulation parameters +sim = u.Param() +sim.energy = u.keV2m(1.0)/6.32e-7 +sim.distance = 15e-2 +sim.psize = 24e-6 +sim.shape = 256 +sim.xy = u.Param() +sim.xy.model = "round" +sim.xy.spacing = 0.3e-3 +sim.xy.steps = 60 +sim.xy.extent = (5e-3,10e-3) + +sim.illumination = u.Param() +sim.illumination.model = None +sim.illumination.photons = int(1e9) +sim.illumination.aperture = u.Param() +sim.illumination.aperture.diffuser = None#(0.7,3) +sim.illumination.aperture.form = "circ" +sim.illumination.aperture.size = 1.0e-3 +sim.illumination.aperture.edge = 2 +sim.illumination.aperture.central_stop = None +sim.illumination.propagation = u.Param() +sim.illumination.propagation.focussed = None +sim.illumination.propagation.parallel = 0.03 +sim.illumination.propagation.spot_size = None + +imgfile = "/".join([str(pathlib.Path(__file__).parent.resolve()), '../../ptypy/resources/ptypy_logo_1M.png']) +sim.sample = u.Param() +sim.sample.model = -u.rgb2complex(u.imload(imgfile)[::-1,:,:-1]) +sim.sample.process = u.Param() +sim.sample.process.offset = (0,0) +sim.sample.process.zoom = 0.5 +sim.sample.process.formula = None +sim.sample.process.density = None +sim.sample.process.thickness = None +sim.sample.process.ref_index = None +sim.sample.process.smoothing = None +sim.sample.fill = 1.0+0.j +sim.plot=False +sim.detector = u.Param(dtype=np.uint32,full_well=2**32-1,psf=None) + +# Scan model and initial value parameters +p.scans = u.Param() +p.scans.ptypy = u.Param() +p.scans.ptypy.name = 'BlockFull' + +p.scans.ptypy.coherence = u.Param() +p.scans.ptypy.coherence.num_probe_modes=1 + +p.scans.ptypy.illumination = u.Param() +p.scans.ptypy.illumination.model=None +p.scans.ptypy.illumination.aperture = u.Param() +p.scans.ptypy.illumination.aperture.diffuser = None +p.scans.ptypy.illumination.aperture.form = "circ" +p.scans.ptypy.illumination.aperture.size = 1.0e-3 +p.scans.ptypy.illumination.aperture.edge = 10 + +# Scan data (simulation) parameters +p.scans.ptypy.data = u.Param() +p.scans.ptypy.data.name = 'SimScan' +p.scans.ptypy.data.update(sim) + +# Reconstruction parameters +p.engines = u.Param() +p.engines.engine00 = u.Param() +p.engines.engine00.name = 'DM_cupy' +p.engines.engine00.numiter = 40 +p.engines.engine00.fourier_relax_factor = 0.05 + +u.verbose.set_level("info") +if __name__ == "__main__": + P = Ptycho(p,level=5) diff --git a/templates/accelerate/ptypy_laser_logo_focused_pycuda.py b/templates/accelerate/ptypy_laser_logo_focused_pycuda.py index 3d0b44b7..f9be2ce3 100644 --- a/templates/accelerate/ptypy_laser_logo_focused_pycuda.py +++ b/templates/accelerate/ptypy_laser_logo_focused_pycuda.py @@ -52,7 +52,7 @@ sim.illumination.propagation.parallel = 0.03 sim.illumination.propagation.spot_size = None -imgfile = "/".join([str(pathlib.Path(__file__).parent.resolve()), '../../resources/ptypy_logo_1M.png']) +imgfile = "/".join([str(pathlib.Path(__file__).parent.resolve()), '../../ptypy/resources/ptypy_logo_1M.png']) sim.sample = u.Param() sim.sample.model = 
 sim.sample.process = u.Param()
diff --git a/templates/ptypy_laser_logo_focused.py b/templates/ptypy_laser_logo_focused.py
index 63fbb082..ae71c1e3 100644
--- a/templates/ptypy_laser_logo_focused.py
+++ b/templates/ptypy_laser_logo_focused.py
@@ -50,7 +50,7 @@
 sim.illumination.propagation.parallel = 0.03
 sim.illumination.propagation.spot_size = None
 
-imgfile = "/".join([str(pathlib.Path(__file__).parent.resolve()), '../resources/ptypy_logo_1M.png'])
+imgfile = "/".join([str(pathlib.Path(__file__).parent.resolve()), '../ptypy/resources/ptypy_logo_1M.png'])
 sim.sample = u.Param()
 sim.sample.model = -u.rgb2complex(u.imload(imgfile)[::-1,:,:-1])
 sim.sample.process = u.Param()
From ab92257ae0968383aa21670f334eb1640d31ffa9 Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer"
Date: Mon, 11 Mar 2024 10:46:05 +0000
Subject: [PATCH 13/14] Update docs for 0.8.1 (#545)

* bump version to 0.8.1
* Updated documentation
* improved installation instructions
* changed filtered cufft instructions to cupy env
* small changes in text
---
 README.rst                                   | 14 ++++++-
 doc/html_templates/ptypysphinx/download.html |  3 +-
 doc/index.rst                                |  3 +-
 doc/rst_templates/getting_started.tmp        | 42 +++++++++++++++-----
 ptypy/version.py                             |  4 +-
 5 files changed, 50 insertions(+), 16 deletions(-)

diff --git a/README.rst b/README.rst
index bafea120..9827ddf5 100644
--- a/README.rst
+++ b/README.rst
@@ -62,7 +62,8 @@ Features
 
     $ mpiexec -n [nodes] python .py
 
-* **GPU acceleration** based on custom kernels, pycuda, and reikna.
+* **GPU acceleration** based on custom kernels, CuPy or PyCUDA/reikna.
+  See examples in ``templates/accelerate``, ``templates/engines/cupy`` and ``templates/engines/pycuda``.
 
 * A **client-server** approach for visualization and control based on
   `ZeroMQ `_ .
@@ -100,6 +101,17 @@ Ptypy depends on standard python packages:
 * mpi4py (optional - required for parallel computing)
 * pyzmq (optional - required for the plotting client)
 
+GPU support
+-----------
+
+We support an accelerated version of |ptypy| for CUDA-capable GPUs based on our own kernels and the
+`CuPy `_ package. We recommend installing the dependencies for this version like so.
+::
+
+    $ conda env create -f accelerate/cuda_cupy/dependencies.yml
+    $ conda activate ptypy_cupy
+    (ptypy_cupy)$ pip install .
+
 Quicklinks
 ----------
 
diff --git a/doc/html_templates/ptypysphinx/download.html b/doc/html_templates/ptypysphinx/download.html
index 0bdeb6e7..f7ff1f31 100644
--- a/doc/html_templates/ptypysphinx/download.html
+++ b/doc/html_templates/ptypysphinx/download.html
@@ -20,7 +20,8 @@

 If you are having trouble getting ptypy up und running please let us know
 ( Pierre or
-Bjoern).
+Bjoern or
+Benedikt).

You are also encouraged to file any issue, bug or request at the issue tracker.

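As a quick orientation alongside these documentation changes, the following is a minimal sketch of a reconstruction script that selects the CuPy-accelerated engines the docs refer to. It is an illustration only and not part of any patch in this series; the MoonFlowerScan simulated data, the frame count and the iteration numbers are assumptions chosen for brevity, and it presumes ptypy is installed in the ptypy_cupy environment described above::

    from ptypy.core import Ptycho
    from ptypy import utils as u
    import ptypy

    ptypy.load_gpu_engines(arch="cupy")       # registers the cupy engines (DM_cupy, ML_cupy, ...)

    p = u.Param()
    p.verbose_level = "info"
    p.io = u.Param()
    p.io.autosave = u.Param(active=False)
    p.io.autoplot = u.Param(active=False)
    p.io.interaction = u.Param(active=False)

    p.scans = u.Param()
    p.scans.MF = u.Param()
    p.scans.MF.name = 'BlockFull'             # same scan model as the templates above
    p.scans.MF.data = u.Param()
    p.scans.MF.data.name = 'MoonFlowerScan'   # built-in simulated data, no experiment required
    p.scans.MF.data.shape = 128               # illustrative choice
    p.scans.MF.data.num_frames = 100          # illustrative choice

    p.engines = u.Param()
    p.engines.engine00 = u.Param(name='DM_cupy', numiter=80)
    p.engines.engine01 = u.Param(name='ML_cupy', numiter=40)

    if __name__ == "__main__":
        P = Ptycho(p, level=5)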
diff --git a/doc/index.rst b/doc/index.rst
index 317f5cf5..07c48ad0 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -35,7 +35,8 @@ Highlights
 
     $ mpiexec/mpirun -n [nodes] python .py
 
-* **GPU acceleration** based on custom kernels, pycuda, and reikna.
+* **GPU acceleration** based on custom kernels, CuPy or PyCUDA/reikna.
+  See examples in ``templates/accelerate``, ``templates/engines/cupy`` and ``templates/engines/pycuda``.
 
 * A **client-server** approach for visualization and control based on
   `ZeroMQ `_ .
diff --git a/doc/rst_templates/getting_started.tmp b/doc/rst_templates/getting_started.tmp
index 851734eb..5d1d83d6 100644
--- a/doc/rst_templates/getting_started.tmp
+++ b/doc/rst_templates/getting_started.tmp
@@ -86,11 +86,26 @@ Install the recommended version like so
 
     (ptypy_full)$ pip install .
 
-Recommended install for GPU support
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+Recommended install for GPU support with CuPy
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 We support an accelerated version of |ptypy|_ for CUDA-capable
 GPUs based on our own kernels and the
+`CuPy `_ package.
+
+Install the dependencies for this version like so.
+::
+
+    $ conda env create -f ptypy/accelerate/cuda_cupy/dependencies.yml
+    $ conda activate ptypy_cupy
+    (ptypy_cupy)$ pip install .
+
+
+Install for GPU support with PyCUDA
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Alternatively, we also support an accelerated version of |ptypy|_ for CUDA-capable
+GPUs based on our own kernels and the
 `PyCUDA `_ package.
 
 Install the dependencies for this version like so.
@@ -101,17 +116,22 @@ Install the dependencies for this version like so.
 
     (ptypy_pycuda)$ pip install .
 
-While we use `Reikna `_ to
-provide a filtered FFT, i.e. a FFT that is fused with a pointwise
-matrix multiplication, you can optionally also install a version
-based on cufft and callbacks. Due to the nature of this extension it
-needs to be built for fixed array sizes externally.
+We use `Reikna `_ to
+provide a filtered FFT, i.e. a FFT that is fused with a pointwise matrix multiplication.
+
+Optional installation of filtered cufft
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For optimal performance with both CuPy and PyCUDA engines, you can optionally install a version
+of the filtered FFT based on cufft and callbacks. Due to the nature of this extension it
+needs to be built for fixed array sizes externally and currently supports array
+sizes of 16, 32, 64, 128, 256, 512, 1024 and 2048.
 ::
 
-    $ conda activate ptypy_pycuda
-    (ptypy_pycuda)$ cd cufft
-    (ptypy_pycuda)$ conda env update --file dependencies.yml --name ptypy_pycuda
-    (ptypy_pycuda)$ pip install .
+    $ conda activate ptypy_cupy
+    (ptypy_cupy)$ cd cufft
+    (ptypy_cupy)$ conda env update --file dependencies.yml --name ptypy_cupy
+    (ptypy_cupy)$ pip install .
 
 
 Optional packages
diff --git a/ptypy/version.py b/ptypy/version.py
index ee3909a1..3abe0dd0 100644
--- a/ptypy/version.py
+++ b/ptypy/version.py
@@ -1,6 +1,6 @@
-short_version = '0.8.0'
-version = '0.8.0'
+short_version = '0.8.1'
+version = '0.8.1'
 release = False
 
 if not release:
From d8b788db2b66bee00f4b0dd735213558f93f1dda Mon Sep 17 00:00:00 2001
From: "Benedikt J. Daurer"
Daurer" Date: Mon, 11 Mar 2024 12:28:34 +0000 Subject: [PATCH 14/14] Adding Python 3.12 to github workflow (#548) * adding Python 3.12 to workflow * added more comments in workflow --- .github/workflows/test.yml | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index c3d61e1a..435c675c 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -24,29 +24,39 @@ jobs: max-parallel: 10 fail-fast: false matrix: - python-version: ['3.8','3.9','3.10', '3.11'] - name: Testing with Python ${{ matrix.python-version }} + python-version: # There is a bug in conda 24.1.2 for Python < 3.12 so we pin the version to 23.11.0 + - python: "3.8" + conda: "23.11.0" + - python: "3.9" + conda: "23.11.0" + - python: "3.10" + conda: "23.11.0" + - python: "3.11" + conda: "23.11.0" + - python: "3.12" + conda: "latest" + name: Testing with Python ${{ matrix.python-version.python }} steps: - name: Checkout uses: actions/checkout@v4 - - name: Set up Python ${{ matrix.python-version }} + - name: Set up Python ${{ matrix.python-version.python }} uses: actions/setup-python@v5 with: - python-version: ${{ matrix.python-version }} + python-version: ${{ matrix.python-version.python }} - name: Add conda to system path run: | # $CONDA is an environment variable pointing to the root of the miniconda directory echo $CONDA/bin >> $GITHUB_PATH conda --version - - name: Change conda version + - name: Change conda version if necessary + if: ${{ matrix.python-version.conda != 'latest' }} run: | - # There is a bug in the most recent version (24.1.2) - conda install conda=23.11.0 python=3.11 + conda install conda=${{ matrix.python-version.conda }} python=${{ matrix.python-version.python }} conda --version - name: Install dependencies run: | # replace python version in core dependencies - sed -i 's/python/python=${{ matrix.python-version }}/' dependencies_core.yml + sed -i 's/python/python=${{ matrix.python-version.python }}/' dependencies_core.yml conda env update --file dependencies_core.yml --name base conda list - name: Prepare ptypy