From 56688de26698ce5bcc41d38359f9f3f0a280aca9 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Thu, 15 Feb 2024 12:19:52 +0000 Subject: [PATCH 01/26] Add a Poisson solver selection mechanism. --- src/common.f90 | 1 + src/solver.f90 | 48 +++++++++++++++++++++++++++++++++++++++++++++++- src/xcompact.f90 | 2 ++ 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/src/common.f90 b/src/common.f90 index 3b46919c..433f985d 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -15,6 +15,7 @@ module m_common real(dp) :: dx, dy, dz integer :: nproc_x = 1, nproc_y = 1, nproc_z = 1 character(len=20) :: BC_x_s, BC_x_e, BC_y_s, BC_y_e, BC_z_s, BC_z_e + logical :: use_fft end type globs_t contains diff --git a/src/solver.f90 b/src/solver.f90 index fad0a715..87355326 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -43,6 +43,7 @@ module m_solver class(base_backend_t), pointer :: backend class(dirps_t), pointer :: xdirps, ydirps, zdirps class(time_intg_t), pointer :: time_integrator + procedure(poisson_solver), pointer :: poisson => null() contains procedure :: transeq procedure :: divergence_v2p @@ -51,9 +52,22 @@ module m_solver procedure :: run end type solver_t + abstract interface + subroutine poisson_solver(self, pressure, div_u) + import :: solver_t + import :: field_t + implicit none + + class(solver_t) :: self + class(field_t), intent(inout) :: pressure + class(field_t), intent(in) :: div_u + end subroutine poisson_solver + end interface + interface solver_t module procedure init end interface solver_t + contains function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & @@ -83,6 +97,14 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & solver%v => solver%backend%allocator%get_block() solver%w => solver%backend%allocator%get_block() + if (globs%use_fft) then + print*, 'Poisson solver: FFT' + solver%poisson => poisson_fft + else + print*, 'Poisson solver: CG' + solver%poisson => poisson_cg + end if + ! Set initial conditions dims(:) = solver%backend%allocator%dims(:) allocate(u_init(dims(1), dims(2), dims(3))) @@ -489,6 +511,30 @@ subroutine curl(self, o_i_hat, o_j_hat, o_k_hat, u, v, w) end subroutine curl + subroutine poisson_fft(self, pressure, div_u) + implicit none + + class(solver_t) :: self + class(field_t), intent(inout) :: pressure + class(field_t), intent(in) :: div_u + + ! call forward FFT + + ! postprocess + + ! call backward FFT + + end subroutine poisson_fft + + subroutine poisson_cg(self, pressure, div_u) + implicit none + + class(solver_t) :: self + class(field_t), intent(inout) :: pressure + class(field_t), intent(in) :: div_u + + end subroutine poisson_cg + subroutine run(self, n_iter, u_out, v_out, w_out) implicit none @@ -524,7 +570,7 @@ subroutine run(self, n_iter, u_out, v_out, w_out) pressure => self%backend%allocator%get_block() - !call self%poisson(pressure, div_u) + call self%poisson(pressure, div_u) call self%backend%allocator%release_block(div_u) diff --git a/src/xcompact.f90 b/src/xcompact.f90 index 779e09f0..b5d4e1fd 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -62,6 +62,8 @@ program xcompact ! set nprocs based on run time arguments globs%nproc_x = 1; globs%nproc_y = 1; globs%nproc_z = 1 + globs%use_fft = .true. + ! Lets allow a 1D decomposition for the moment !globs%nproc_x = nproc From f037d114f8ed4d8e95dabb34a3cafc9b0c2583ae Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Thu, 15 Feb 2024 12:24:02 +0000 Subject: [PATCH 02/26] Implement a dedicated class for the FFT based Poisson solver. --- src/CMakeLists.txt | 3 ++ src/backend.f90 | 2 + src/cuda/backend.f90 | 15 +++++- src/cuda/poisson_fft.f90 | 74 +++++++++++++++++++++++++ src/omp/backend.f90 | 13 ++++- src/omp/poisson_fft.f90 | 57 ++++++++++++++++++++ src/poisson_fft.f90 | 114 +++++++++++++++++++++++++++++++++++++++ src/solver.f90 | 4 ++ src/xcompact.f90 | 4 +- 9 files changed, 282 insertions(+), 4 deletions(-) create mode 100644 src/cuda/poisson_fft.f90 create mode 100644 src/omp/poisson_fft.f90 create mode 100644 src/poisson_fft.f90 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 330a14c1..4f50d39e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,12 +2,14 @@ set(SRC allocator.f90 backend.f90 common.f90 + poisson_fft.f90 solver.f90 tdsops.f90 time_integrator.f90 omp/backend.f90 omp/common.f90 omp/kernels_dist.f90 + omp/poisson_fft.f90 omp/sendrecv.f90 omp/exec_dist.f90 ) @@ -18,6 +20,7 @@ set(CUDASRC cuda/exec_dist.f90 cuda/kernels/distributed.f90 cuda/kernels/reorder.f90 + cuda/poisson_fft.f90 cuda/sendrecv.f90 cuda/tdsops.f90 ) diff --git a/src/backend.f90 b/src/backend.f90 index cf453dad..44b1bf93 100644 --- a/src/backend.f90 +++ b/src/backend.f90 @@ -1,6 +1,7 @@ module m_base_backend use m_allocator, only: allocator_t, field_t use m_common, only: dp + use m_poisson_fft, only: poisson_fft_t use m_tdsops, only: tdsops_t, dirps_t implicit none @@ -24,6 +25,7 @@ module m_base_backend integer :: nx_loc, ny_loc, nz_loc class(allocator_t), pointer :: allocator class(dirps_t), pointer :: xdirps, ydirps, zdirps + class(poisson_fft_t), pointer :: poisson_fft contains procedure(transeq_ders), deferred :: transeq_x procedure(transeq_ders), deferred :: transeq_y diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 1097d574..5b97ea8f 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -7,11 +7,13 @@ module m_cuda_backend use m_base_backend, only: base_backend_t use m_common, only: dp, globs_t, & RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y + use m_poisson_fft, only: poisson_fft_t use m_tdsops, only: dirps_t, tdsops_t use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t use m_cuda_common, only: SZ use m_cuda_exec_dist, only: exec_dist_transeq_3fused, exec_dist_tds_compact + use m_cuda_poisson_fft, only: cuda_poisson_fft_t use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields use m_cuda_tdsops, only: cuda_tdsops_t use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs @@ -56,13 +58,15 @@ module m_cuda_backend contains - function init(globs, allocator) result(backend) + function init(globs, allocator, xdirps, ydirps, zdirps) result(backend) implicit none class(globs_t) :: globs class(allocator_t), target, intent(inout) :: allocator + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps type(cuda_backend_t) :: backend + type(cuda_poisson_fft_t) :: cuda_poisson_fft integer :: n_halo, n_block select type(allocator) @@ -111,6 +115,15 @@ function init(globs, allocator) result(backend) allocate(backend%d2u_recv_s_dev(SZ, 1, n_block)) allocate(backend%d2u_recv_e_dev(SZ, 1, n_block)) + if (globs%use_fft) then + allocate(cuda_poisson_fft_t :: backend%poisson_fft) + + select type (poisson_fft => backend%poisson_fft) + type is (cuda_poisson_fft_t) + poisson_fft = cuda_poisson_fft_t(xdirps, ydirps, zdirps) + end select + end if + end function init subroutine alloc_cuda_tdsops( & diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 new file mode 100644 index 00000000..9b3adb01 --- /dev/null +++ b/src/cuda/poisson_fft.f90 @@ -0,0 +1,74 @@ +module m_cuda_poisson_fft + use cudafor + + use m_allocator, only: field_t + use m_common, only: dp + use m_poisson_fft, only: poisson_fft_t + use m_tdsops, only: dirps_t + + implicit none + + type, extends(poisson_fft_t) :: cuda_poisson_fft_t + !! FFT based Poisson solver + !! It can only handle 1D decompositions along z direction. + complex(dp), device, allocatable, dimension(:, :, :) :: & + c_x_dev, c_y_dev, c_z_dev, waves_dev + complex(dp), device, allocatable, dimension(:) :: ax_dev, bx_dev, & + ay_dev, by_dev, az_dev, bz_dev + contains + procedure :: fft_forward => fft_forward_cuda + procedure :: fft_backward => fft_backward_cuda + procedure :: fft_postprocess => fft_postprocess_cuda + end type cuda_poisson_fft_t + + interface cuda_poisson_fft_t + module procedure init + end interface cuda_poisson_fft_t + +contains + + function init(xdirps, ydirps, zdirps) result(poisson_fft) + implicit none + + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + type(cuda_poisson_fft_t) :: poisson_fft + integer :: nx, ny, nz + + call poisson_fft%base_init(xdirps, ydirps, zdirps) + + nx = poisson_fft%nx; ny = poisson_fft%ny; nz = poisson_fft%nz + + allocate (poisson_fft%waves_dev(nx, ny, nz)) + poisson_fft%waves_dev = poisson_fft%waves + + allocate (poisson_fft%ax_dev(nx), poisson_fft%bx_dev(nx)) + allocate (poisson_fft%ay_dev(ny), poisson_fft%by_dev(ny)) + allocate (poisson_fft%az_dev(nz), poisson_fft%bz_dev(nz)) + poisson_fft%ax_dev = poisson_fft%ax; poisson_fft%bx_dev = poisson_fft%bx + poisson_fft%ay_dev = poisson_fft%ay; poisson_fft%by_dev = poisson_fft%by + poisson_fft%az_dev = poisson_fft%az; poisson_fft%bz_dev = poisson_fft%bz + + end function init + + subroutine fft_forward_cuda(self, f_in) + implicit none + + class(cuda_poisson_fft_t) :: self + class(field_t), intent(in) :: f_in + end subroutine fft_forward_cuda + + subroutine fft_backward_cuda(self, f_out) + implicit none + + class(cuda_poisson_fft_t) :: self + class(field_t), intent(inout) :: f_out + end subroutine fft_backward_cuda + + subroutine fft_postprocess_cuda(self) + implicit none + + class(cuda_poisson_fft_t) :: self + end subroutine fft_postprocess_cuda + +end module m_cuda_poisson_fft diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index aca287bd..55c2f005 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -7,6 +7,7 @@ module m_omp_backend use m_omp_sendrecv, only: sendrecv_fields use m_omp_common, only: SZ + use m_omp_poisson_fft, only: omp_poisson_fft_t implicit none @@ -42,11 +43,12 @@ module m_omp_backend contains - function init(globs, allocator) result(backend) + function init(globs, allocator, xdirps, ydirps, zdirps) result(backend) implicit none class(globs_t) :: globs class(allocator_t), target, intent(inout) :: allocator + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps type(omp_backend_t) :: backend integer :: n_halo, n_block @@ -86,6 +88,15 @@ function init(globs, allocator) result(backend) allocate(backend%d2u_recv_s(SZ, 1, n_block)) allocate(backend%d2u_recv_e(SZ, 1, n_block)) + if (globs%use_fft) then + allocate(omp_poisson_fft_t :: backend%poisson_fft) + + select type (poisson_fft => backend%poisson_fft) + type is (omp_poisson_fft_t) + poisson_fft = omp_poisson_fft_t(xdirps, ydirps, zdirps) + end select + end if + end function init subroutine alloc_omp_tdsops( & diff --git a/src/omp/poisson_fft.f90 b/src/omp/poisson_fft.f90 new file mode 100644 index 00000000..e561f871 --- /dev/null +++ b/src/omp/poisson_fft.f90 @@ -0,0 +1,57 @@ +module m_omp_poisson_fft + use m_allocator, only: field_t + use m_common, only: dp + use m_poisson_fft, only: poisson_fft_t + use m_tdsops, only: dirps_t + + implicit none + + type, extends(poisson_fft_t) :: omp_poisson_fft_t + !! FFT based Poisson solver + !! It can only handle 1D decompositions along z direction. + complex(dp), allocatable, dimension(:, :, :) :: c_x, c_y, c_z + contains + procedure :: fft_forward => fft_forward_omp + procedure :: fft_backward => fft_backward_omp + procedure :: fft_postprocess => fft_postprocess_omp + end type omp_poisson_fft_t + + interface omp_poisson_fft_t + module procedure init + end interface omp_poisson_fft_t + +contains + + function init(xdirps, ydirps, zdirps) result(poisson_fft) + implicit none + + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + type(omp_poisson_fft_t) :: poisson_fft + integer :: nx, ny, nz + + call poisson_fft%base_init(xdirps, ydirps, zdirps) + + end function init + + subroutine fft_forward_omp(self, f_in) + implicit none + + class(omp_poisson_fft_t) :: self + class(field_t), intent(in) :: f_in + end subroutine fft_forward_omp + + subroutine fft_backward_omp(self, f_out) + implicit none + + class(omp_poisson_fft_t) :: self + class(field_t), intent(inout) :: f_out + end subroutine fft_backward_omp + + subroutine fft_postprocess_omp(self) + implicit none + + class(omp_poisson_fft_t) :: self + end subroutine fft_postprocess_omp + +end module m_omp_poisson_fft diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90 new file mode 100644 index 00000000..8f9e6e0e --- /dev/null +++ b/src/poisson_fft.f90 @@ -0,0 +1,114 @@ +module m_poisson_fft + use m_allocator, only: field_t + use m_common, only: dp, pi + use m_tdsops, only: dirps_t + + implicit none + + type, abstract :: poisson_fft_t + !! FFT based Poisson solver + !! It can only handle 1D decompositions along z direction. + integer :: nx, ny, nz + complex(dp), allocatable, dimension(:, :, :) :: waves + complex(dp), allocatable, dimension(:) :: ax, bx, ay, by, az, bz + contains + procedure(fft_forward), deferred :: fft_forward + procedure(fft_backward), deferred :: fft_backward + procedure(fft_postprocess), deferred :: fft_postprocess + procedure :: base_init + procedure :: waves_set + end type poisson_fft_t + + abstract interface + subroutine fft_forward(self, f_in) + import :: poisson_fft_t + import :: field_t + implicit none + + class(poisson_fft_t) :: self + class(field_t), intent(in) :: f_in + end subroutine fft_forward + + subroutine fft_backward(self, f_out) + import :: poisson_fft_t + import :: field_t + implicit none + + class(poisson_fft_t) :: self + class(field_t), intent(inout) :: f_out + end subroutine fft_backward + + subroutine fft_postprocess(self) + import :: poisson_fft_t + implicit none + + class(poisson_fft_t) :: self + end subroutine fft_postprocess + end interface + +contains + + subroutine base_init(self, xdirps, ydirps, zdirps) + implicit none + + class(poisson_fft_t) :: self + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + integer :: nx, ny, nz + + self%nx = xdirps%n; self%ny = ydirps%n; self%nz = zdirps%n + + allocate (self%ax(self%nx), self%bx(self%nx)) + allocate (self%ay(self%nx), self%by(self%nx)) + allocate (self%az(self%nx), self%bz(self%nx)) + + allocate (self%waves(self%nx, self%ny, self%nz)) + + ! waves_set requires some of the preprocessed tdsops variables. + call self%waves_set(xdirps, ydirps, zdirps) + + end subroutine base_init + + subroutine waves_set(self, xdirps, ydirps, zdirps) + implicit none + + class(poisson_fft_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + complex(dp), allocatable, dimension(:) :: xkx, xk2, yky, yk2, zkz, zk2, & + exs, eys, ezs + + integer :: nx, ny, nz + real(dp) :: w, wp, rlexs, rleys, rlezs, & + xtt, ytt, ztt, xtt1, ytt1, ztt1, xt1, yt1, zt1 + complex(dp) :: xt2, yt2, zt2, xyzk + + integer :: i, j, k, ka, kb, ix, iy, iz + + + nx = xdirps%n; ny = ydirps%n; nz = zdirps%n + + do i = 1, nx + self%ax(i) = sin((i-1)*pi/nx) + self%bx(i) = cos((i-1)*pi/nx) + end do + + do i = 1, ny + self%ay(i) = sin((i-1)*pi/ny) + self%by(i) = cos((i-1)*pi/ny) + end do + + do i = 1, nz + self%az(i) = sin((i-1)*pi/nz) + self%bz(i) = cos((i-1)*pi/nz) + end do + + ! Now kxyz + allocate(xkx(nx), xk2(nx), exs(nx)) + allocate(yky(ny), yk2(ny), eys(ny)) + allocate(zkz(nz), zk2(nz), ezs(nz)) + !xkx(:) = 0; xk2(:) = 0; yky(:) = 0; yk2(:) = 0; zkz(:) = 0; zk2(:) = 0 + + end subroutine waves_set + +end module m_poisson_fft diff --git a/src/solver.f90 b/src/solver.f90 index 87355326..a0ccc37f 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -519,10 +519,14 @@ subroutine poisson_fft(self, pressure, div_u) class(field_t), intent(in) :: div_u ! call forward FFT + ! output array in spectral space is stored at poisson_fft class + call self%backend%poisson_fft%fft_forward(div_u) ! postprocess + call self%backend%poisson_fft%fft_postprocess ! call backward FFT + call self%backend%poisson_fft%fft_backward(pressure) end subroutine poisson_fft diff --git a/src/xcompact.f90 b/src/xcompact.f90 index b5d4e1fd..2474b45d 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -106,7 +106,7 @@ program xcompact allocator => cuda_allocator print*, 'CUDA allocator instantiated' - cuda_backend = cuda_backend_t(globs, allocator) + cuda_backend = cuda_backend_t(globs, allocator, xdirps, ydirps, zdirps) backend => cuda_backend print*, 'CUDA backend instantiated' #else @@ -114,7 +114,7 @@ program xcompact allocator => omp_allocator print*, 'OpenMP allocator instantiated' - omp_backend = omp_backend_t(globs, allocator) + omp_backend = omp_backend_t(globs, allocator, xdirps, ydirps, zdirps) backend => omp_backend print*, 'OpenMP backend instantiated' #endif From 2fdb1b046ab279de9168d1a03befa828acabf532 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Thu, 15 Feb 2024 13:12:37 +0000 Subject: [PATCH 03/26] Store compact scheme coefficients in tdsops class. --- src/tdsops.f90 | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/tdsops.f90 b/src/tdsops.f90 index c159cdd2..e0140d6e 100644 --- a/src/tdsops.f90 +++ b/src/tdsops.f90 @@ -27,6 +27,7 @@ module m_tdsops dist_sa, dist_sc, & !! back subs. dist_af !! the auxiliary factors real(dp), allocatable :: coeffs(:), coeffs_s(:, :), coeffs_e(:, :) + real(dp) :: alpha, a, b, c = 0._dp, d = 0._dp integer :: n, n_halo contains procedure :: deriv_1st, deriv_2nd, interpl_mid, stagder_1st, preprocess @@ -154,6 +155,9 @@ subroutine deriv_1st(self, delta, scheme, bc_start, bc_end, sym) error stop 'scheme is not defined' end select + self%alpha = alpha + self%a = afi; self%b = bfi + self%coeffs(:) = [0._dp, 0._dp, -bfi, -afi, & 0._dp, & afi, bfi, 0._dp, 0._dp] @@ -321,6 +325,9 @@ subroutine deriv_2nd(self, delta, scheme, bc_start, bc_end, sym, & error stop 'scheme is not defined' end select + self%alpha = alpha + self%a = asi; self%b = bsi; self%c = csi; self%d = dsi + self%coeffs(:) = [dsi, csi, bsi, asi, & -2._dp*(asi + bsi + csi + dsi), & asi, bsi, csi, dsi] @@ -511,6 +518,9 @@ subroutine interpl_mid(self, scheme, from_to, bc_start, bc_end, sym) error stop 'scheme is not defined' end select + self%alpha = alpha + self%a = aici; self%b = bici; self%c = cici; self%d = dici + select case (from_to) case ('v2p') self%coeffs(:) = [0._dp, dici, cici, bici, & @@ -633,6 +643,9 @@ subroutine stagder_1st(self, delta, scheme, from_to, bc_start, bc_end, sym) error stop 'scheme is not defined' end select + self%alpha = alpha + self%a = aci; self%b = bci + select case (from_to) case ('v2p') self%coeffs(:) = [0._dp, 0._dp, 0._dp, -bci, & From a306ee499b8c0de93fec80994715c63b685e126a Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Mon, 19 Feb 2024 11:41:52 +0000 Subject: [PATCH 04/26] Store domain length and dx in dirps_t. --- src/tdsops.f90 | 1 + src/xcompact.f90 | 3 +++ 2 files changed, 4 insertions(+) diff --git a/src/tdsops.f90 b/src/tdsops.f90 index e0140d6e..3db7b4d9 100644 --- a/src/tdsops.f90 +++ b/src/tdsops.f90 @@ -47,6 +47,7 @@ module m_tdsops stagder_v2p, stagder_p2v, & interpl_v2p, interpl_p2v integer :: nrank, nproc, pnext, pprev, n, n_blocks + real(dp) :: L, d end type dirps_t contains diff --git a/src/xcompact.f90 b/src/xcompact.f90 index 2474b45d..d407938c 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -55,6 +55,7 @@ program xcompact ! read L_x/y/z from the input file globs%Lx = 2*pi; globs%Ly = 2*pi; globs%Lz = 2*pi + xdirps%L = globs%Lx; ydirps%L = globs%Ly; zdirps%L = globs%Lz ! read ns from the input file globs%nx = 512; globs%ny = 512; globs%nz = 512 @@ -93,6 +94,8 @@ program xcompact globs%dy = globs%Ly/globs%ny globs%dz = globs%Lz/globs%nz + xdirps%d = globs%dx; ydirps%d = globs%dy; zdirps%d = globs%dz + xdirps%n = globs%nx_loc ydirps%n = globs%ny_loc zdirps%n = globs%nz_loc From c7ff92a373fd5f96eb801bf7a5e530393ad1e4af Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Mon, 19 Feb 2024 11:43:57 +0000 Subject: [PATCH 05/26] Change the way we initialise poisson_fft class. --- src/backend.f90 | 12 ++++++++++++ src/cuda/backend.f90 | 25 ++++++++++++++++--------- src/omp/backend.f90 | 25 ++++++++++++++++--------- src/solver.f90 | 17 +++++++++-------- 4 files changed, 53 insertions(+), 26 deletions(-) diff --git a/src/backend.f90 b/src/backend.f90 index 44b1bf93..e17970e5 100644 --- a/src/backend.f90 +++ b/src/backend.f90 @@ -39,6 +39,7 @@ module m_base_backend procedure(get_field), deferred :: get_field procedure(set_field), deferred :: set_field procedure(alloc_tdsops), deferred :: alloc_tdsops + procedure(init_poisson_fft), deferred :: init_poisson_fft end type base_backend_t abstract interface @@ -190,4 +191,15 @@ subroutine alloc_tdsops(self, tdsops, n, dx, operation, scheme, n_halo, & end subroutine alloc_tdsops end interface + abstract interface + subroutine init_poisson_fft(self, xdirps, ydirps, zdirps) + import :: base_backend_t + import :: dirps_t + implicit none + + class(base_backend_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + end subroutine init_poisson_fft + end interface + end module m_base_backend diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 5b97ea8f..741594ec 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -47,6 +47,7 @@ module m_cuda_backend procedure :: scalar_product => scalar_product_cuda procedure :: set_field => set_field_cuda procedure :: get_field => get_field_cuda + procedure :: init_poisson_fft => init_cuda_poisson_fft procedure :: transeq_cuda_dist procedure :: transeq_cuda_thom procedure :: tds_solve_dist @@ -115,15 +116,6 @@ function init(globs, allocator, xdirps, ydirps, zdirps) result(backend) allocate(backend%d2u_recv_s_dev(SZ, 1, n_block)) allocate(backend%d2u_recv_e_dev(SZ, 1, n_block)) - if (globs%use_fft) then - allocate(cuda_poisson_fft_t :: backend%poisson_fft) - - select type (poisson_fft => backend%poisson_fft) - type is (cuda_poisson_fft_t) - poisson_fft = cuda_poisson_fft_t(xdirps, ydirps, zdirps) - end select - end if - end function init subroutine alloc_cuda_tdsops( & @@ -613,5 +605,20 @@ subroutine get_field_cuda(self, arr, f) end subroutine get_field_cuda + subroutine init_cuda_poisson_fft(self, xdirps, ydirps, zdirps) + implicit none + + class(cuda_backend_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + allocate(cuda_poisson_fft_t :: self%poisson_fft) + + select type (poisson_fft => self%poisson_fft) + type is (cuda_poisson_fft_t) + poisson_fft = cuda_poisson_fft_t(xdirps, ydirps, zdirps) + end select + + end subroutine init_cuda_poisson_fft + end module m_cuda_backend diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 55c2f005..e1863b5a 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -34,6 +34,7 @@ module m_omp_backend procedure :: scalar_product => scalar_product_omp procedure :: set_field => set_field_omp procedure :: get_field => get_field_omp + procedure :: init_poisson_fft => init_omp_poisson_fft procedure :: transeq_omp_dist end type omp_backend_t @@ -88,15 +89,6 @@ function init(globs, allocator, xdirps, ydirps, zdirps) result(backend) allocate(backend%d2u_recv_s(SZ, 1, n_block)) allocate(backend%d2u_recv_e(SZ, 1, n_block)) - if (globs%use_fft) then - allocate(omp_poisson_fft_t :: backend%poisson_fft) - - select type (poisson_fft => backend%poisson_fft) - type is (omp_poisson_fft_t) - poisson_fft = omp_poisson_fft_t(xdirps, ydirps, zdirps) - end select - end if - end function init subroutine alloc_omp_tdsops( & @@ -373,5 +365,20 @@ subroutine get_field_omp(self, arr, f) end subroutine get_field_omp + subroutine init_omp_poisson_fft(self, xdirps, ydirps, zdirps) + implicit none + + class(omp_backend_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + allocate(omp_poisson_fft_t :: self%poisson_fft) + + select type (poisson_fft => self%poisson_fft) + type is (omp_poisson_fft_t) + poisson_fft = omp_poisson_fft_t(xdirps, ydirps, zdirps) + end select + + end subroutine init_omp_poisson_fft + end module m_omp_backend diff --git a/src/solver.f90 b/src/solver.f90 index a0ccc37f..80fc5326 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -97,14 +97,6 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & solver%v => solver%backend%allocator%get_block() solver%w => solver%backend%allocator%get_block() - if (globs%use_fft) then - print*, 'Poisson solver: FFT' - solver%poisson => poisson_fft - else - print*, 'Poisson solver: CG' - solver%poisson => poisson_cg - end if - ! Set initial conditions dims(:) = solver%backend%allocator%dims(:) allocate(u_init(dims(1), dims(2), dims(3))) @@ -130,6 +122,15 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & call allocate_tdsops(solver%ydirps, ny, dy, solver%backend) call allocate_tdsops(solver%zdirps, nz, dz, solver%backend) + if (globs%use_fft) then + print*, 'Poisson solver: FFT' + call solver%backend%init_poisson_fft(xdirps, ydirps, zdirps) + solver%poisson => poisson_fft + else + print*, 'Poisson solver: CG' + solver%poisson => poisson_cg + end if + end function init subroutine allocate_tdsops(dirps, nx, dx, backend) From fc3810f23ad091dc17ba7c8769057d6106cccef0 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Mon, 19 Feb 2024 11:45:28 +0000 Subject: [PATCH 06/26] Construct the waves array for all periodic BCs. --- src/cuda/poisson_fft.f90 | 5 +- src/omp/poisson_fft.f90 | 4 +- src/poisson_fft.f90 | 105 ++++++++++++++++++++++++++++++++++++--- 3 files changed, 105 insertions(+), 9 deletions(-) diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 index 9b3adb01..600081ea 100644 --- a/src/cuda/poisson_fft.f90 +++ b/src/cuda/poisson_fft.f90 @@ -6,6 +6,8 @@ module m_cuda_poisson_fft use m_poisson_fft, only: poisson_fft_t use m_tdsops, only: dirps_t + use m_cuda_common, only: SZ + implicit none type, extends(poisson_fft_t) :: cuda_poisson_fft_t @@ -33,9 +35,10 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) class(dirps_t), intent(in) :: xdirps, ydirps, zdirps type(cuda_poisson_fft_t) :: poisson_fft + integer :: nx, ny, nz - call poisson_fft%base_init(xdirps, ydirps, zdirps) + call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ) nx = poisson_fft%nx; ny = poisson_fft%ny; nz = poisson_fft%nz diff --git a/src/omp/poisson_fft.f90 b/src/omp/poisson_fft.f90 index e561f871..f34a1957 100644 --- a/src/omp/poisson_fft.f90 +++ b/src/omp/poisson_fft.f90 @@ -4,6 +4,8 @@ module m_omp_poisson_fft use m_poisson_fft, only: poisson_fft_t use m_tdsops, only: dirps_t + use m_omp_common, only: SZ + implicit none type, extends(poisson_fft_t) :: omp_poisson_fft_t @@ -30,7 +32,7 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) type(omp_poisson_fft_t) :: poisson_fft integer :: nx, ny, nz - call poisson_fft%base_init(xdirps, ydirps, zdirps) + call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ) end function init diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90 index 8f9e6e0e..9dc2a892 100644 --- a/src/poisson_fft.f90 +++ b/src/poisson_fft.f90 @@ -48,11 +48,12 @@ end subroutine fft_postprocess contains - subroutine base_init(self, xdirps, ydirps, zdirps) + subroutine base_init(self, xdirps, ydirps, zdirps, sz) implicit none class(poisson_fft_t) :: self class(dirps_t), intent(in) :: xdirps, ydirps, zdirps + integer, intent(in) :: sz integer :: nx, ny, nz @@ -62,25 +63,25 @@ subroutine base_init(self, xdirps, ydirps, zdirps) allocate (self%ay(self%nx), self%by(self%nx)) allocate (self%az(self%nx), self%bz(self%nx)) - allocate (self%waves(self%nx, self%ny, self%nz)) + allocate (self%waves(sz, self%nx, (self%ny*(self%nz/2 + 1))/sz)) ! waves_set requires some of the preprocessed tdsops variables. - call self%waves_set(xdirps, ydirps, zdirps) + call self%waves_set(xdirps, ydirps, zdirps, sz) end subroutine base_init - subroutine waves_set(self, xdirps, ydirps, zdirps) + subroutine waves_set(self, xdirps, ydirps, zdirps, sz) implicit none class(poisson_fft_t) :: self type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + integer, intent(in) :: sz complex(dp), allocatable, dimension(:) :: xkx, xk2, yky, yk2, zkz, zk2, & exs, eys, ezs integer :: nx, ny, nz - real(dp) :: w, wp, rlexs, rleys, rlezs, & - xtt, ytt, ztt, xtt1, ytt1, ztt1, xt1, yt1, zt1 + real(dp) :: w, wp, rlexs, rleys, rlezs, xtt, ytt, ztt, xt1, yt1, zt1 complex(dp) :: xt2, yt2, zt2, xyzk integer :: i, j, k, ka, kb, ix, iy, iz @@ -107,7 +108,97 @@ subroutine waves_set(self, xdirps, ydirps, zdirps) allocate(xkx(nx), xk2(nx), exs(nx)) allocate(yky(ny), yk2(ny), eys(ny)) allocate(zkz(nz), zk2(nz), ezs(nz)) - !xkx(:) = 0; xk2(:) = 0; yky(:) = 0; yk2(:) = 0; zkz(:) = 0; zk2(:) = 0 + xkx(:) = 0; xk2(:) = 0; yky(:) = 0; yk2(:) = 0; zkz(:) = 0; zk2(:) = 0 + + ! periodic-x + do i = 1, nx/2 + 1 + w = 2*pi*(i - 1)/nx + wp = xdirps%stagder_v2p%a*2*xdirps%d*sin(0.5_dp*w) & + + xdirps%stagder_v2p%b*2*xdirps%d*sin(1.5_dp*w) + wp = wp/(1._dp + 2*xdirps%stagder_v2p%alpha*cos(w)) + + xkx(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*wp/xdirps%L) + exs(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*w/xdirps%L) + xk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*wp/xdirps%L)**2 + end do + do i = nx/2 + 2, nx + xkx(i) = xkx(nx - i + 2) + exs(i) = exs(nx - i + 2) + xk2(i) = xk2(nx - i + 2) + end do + + ! periodic-y + do i = 1, ny/2 + 1 + w = 2*pi*(i - 1)/ny + wp = ydirps%stagder_v2p%a*2*ydirps%d*sin(0.5_dp*w) & + + ydirps%stagder_v2p%b*2*ydirps%d*sin(1.5_dp*w) + wp = wp/(1._dp + 2*ydirps%stagder_v2p%alpha*cos(w)) + + yky(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*wp/ydirps%L) + eys(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*w/ydirps%L) + yk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*wp/ydirps%L)**2 + end do + do i = ny/2 + 2, ny + yky(i) = yky(ny-i+2) + eys(i) = eys(ny-i+2) + yk2(i) = yk2(ny-i+2) + end do + + ! periodic-z + do i = 1, nz/2 + 1 + w = 2*pi*(i - 1)/nz + wp = zdirps%stagder_v2p%a*2*zdirps%d*sin(0.5_dp*w) & + + zdirps%stagder_v2p%b*2*zdirps%d*sin(1.5_dp*w) + wp = wp/(1._dp + 2*zdirps%stagder_v2p%alpha*cos(w)) + + zkz(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L) + ezs(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*w/zdirps%L) + zk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L)**2 + end do + + !do k = !sp%xst(3), sp%xen(3) + !do j = sp%xst(2), sp%xen(2) + ! kxyz array is in z pencil orientation, with nz=nz/2+1 !!nooo + ! kxyz array is in x pencil orientation. + !do k = 1, nx*ny/SZ + print*, 'set kxyz array' + ! TODO: do loop ranges below are valid only for single rank runs + do ka = 1, nz/2 + 1 + do kb = 1, ny/sz + do j = 1, nx + do i = 1, sz + ix = j; iy = (kb-1)*sz+i; iz = ka !+ xderps%nspz_st - 1 + rlexs = real(exs(ix), kind=dp)*xdirps%d + rleys = real(eys(iy), kind=dp)*ydirps%d + rlezs = real(ezs(iz), kind=dp)*zdirps%d + + xtt = 2*(xdirps%interpl_v2p%a*cos(rlexs*0.5_dp) & + + xdirps%interpl_v2p%b*cos(rlexs*1.5_dp) & + + xdirps%interpl_v2p%c*cos(rlexs*2.5_dp) & + + xdirps%interpl_v2p%d*cos(rlexs*3.5_dp)) + ytt = 2*(ydirps%interpl_v2p%a*cos(rleys*0.5_dp) & + + ydirps%interpl_v2p%b*cos(rleys*1.5_dp) & + + ydirps%interpl_v2p%c*cos(rleys*2.5_dp) & + + ydirps%interpl_v2p%d*cos(rleys*3.5_dp)) + ztt = 2*(zdirps%interpl_v2p%a*cos(rlezs*0.5_dp) & + + zdirps%interpl_v2p%b*cos(rlezs*1.5_dp) & + + zdirps%interpl_v2p%c*cos(rlezs*2.5_dp) & + + zdirps%interpl_v2p%d*cos(rlezs*3.5_dp)) + + xt1 = 1._dp + 2*xdirps%interpl_v2p%alpha*cos(rlexs) + yt1 = 1._dp + 2*ydirps%interpl_v2p%alpha*cos(rleys) + zt1 = 1._dp + 2*zdirps%interpl_v2p%alpha*cos(rlezs) + + xt2 = xk2(ix)*(((ytt/yt1)*(ztt/zt1))**2) + yt2 = yk2(iy)*(((xtt/xt1)*(ztt/zt1))**2) + zt2 = zk2(iz)*(((xtt/xt1)*(ytt/yt1))**2) + + xyzk = xt2 + yt2 + zt2 + self%waves(i, j, ka + (kb - 1)*(nz/2 + 1)) = xyzk + end do + end do + end do + end do end subroutine waves_set From 991bb92dab2ac07a4113754adc10d95da6cd5a2d Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Mon, 19 Feb 2024 14:20:54 +0000 Subject: [PATCH 07/26] Create cufft plans. --- src/CMakeLists.txt | 1 + src/cuda/poisson_fft.f90 | 36 +++++++++++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4f50d39e..2b2ad294 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -43,6 +43,7 @@ if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI" OR set(CMAKE_Fortran_FLAGS_DEBUG "-g -O0 -traceback -Mbounds -Mchkptr -Ktrap=fp") set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -fast") target_link_options(x3d2 INTERFACE "-cuda") + target_link_options(x3d2 INTERFACE "-lcufft") target_compile_options(xcompact PRIVATE "-DCUDA") # target_link_options(xcompact INTERFACE "-cuda") diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 index 600081ea..0b2edb1f 100644 --- a/src/cuda/poisson_fft.f90 +++ b/src/cuda/poisson_fft.f90 @@ -1,5 +1,6 @@ module m_cuda_poisson_fft use cudafor + use cufft use m_allocator, only: field_t use m_common, only: dp @@ -36,11 +37,15 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) type(cuda_poisson_fft_t) :: poisson_fft - integer :: nx, ny, nz + integer :: nx, ny, nz, nx_loc, ny_loc, nz_loc + + integer :: planD2Zz, planZ2Dz, planZ2Zx, planZ2Zy, ierrfft + integer(int_ptr_kind()) :: worksize call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ) nx = poisson_fft%nx; ny = poisson_fft%ny; nz = poisson_fft%nz + nx_loc = nx; ny_loc = ny; nz_loc = nz allocate (poisson_fft%waves_dev(nx, ny, nz)) poisson_fft%waves_dev = poisson_fft%waves @@ -52,6 +57,35 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) poisson_fft%ay_dev = poisson_fft%ay; poisson_fft%by_dev = poisson_fft%by poisson_fft%az_dev = poisson_fft%az; poisson_fft%bz_dev = poisson_fft%bz + allocate (poisson_fft%c_x_dev(nx, SZ, (ny*(nz/2 + 1))/SZ)) + allocate (poisson_fft%c_y_dev(ny, SZ, (nx*(nz/2 + 1))/SZ)) + allocate (poisson_fft%c_z_dev(nz/2 + 1, SZ, nx*ny/SZ)) + + ! plans for regular for loop executions in a single stream + ierrfft = cufftCreate(planD2Zz) + ierrfft = cufftMakePlanMany(planD2Zz, 1, nz, & + nz, 1, nz, nz/2+1, 1, nz/2+1, & + CUFFT_D2Z, nx*ny, worksize) + ierrfft = cufftSetWorkArea(planD2Zz, poisson_fft%c_x_dev) + + ierrfft = cufftCreate(planZ2Dz) + ierrfft = cufftMakePlanMany(planZ2Dz, 1, nz, & + nz/2+1, 1, nz/2+1, nz, 1, nz, & + CUFFT_Z2D, nx*ny, worksize) + ierrfft = cufftSetWorkArea(planZ2Dz, poisson_fft%c_x_dev) + + ierrfft = cufftCreate(planZ2Zy) + ierrfft = cufftMakePlanMany(planZ2Zy, 1, ny, & + ny, 1, ny, ny, 1, ny, & + CUFFT_Z2Z, nx*(nz/2 + 1), worksize) + ierrfft = cufftSetWorkArea(planZ2Zy, poisson_fft%c_x_dev) + + ierrfft = cufftCreate(planZ2Zx) + ierrfft = cufftMakePlanMany(planZ2Zx, 1, nx, & + nx, 1, nx, nx, 1, nx, & + CUFFT_Z2Z, ny*(nz/2 + 1), worksize) + ierrfft = cufftSetWorkArea(planZ2Zx, poisson_fft%c_y_dev) + end function init subroutine fft_forward_cuda(self, f_in) From 2756ee701b5df12a9e710072dd29c021fe08152b Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Mon, 19 Feb 2024 14:47:23 +0000 Subject: [PATCH 08/26] Add postprocessing in spectral space. --- src/CMakeLists.txt | 1 + src/cuda/kernels/complex.f90 | 99 ++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 src/cuda/kernels/complex.f90 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 2b2ad294..b2116433 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -19,6 +19,7 @@ set(CUDASRC cuda/common.f90 cuda/exec_dist.f90 cuda/kernels/distributed.f90 + cuda/kernels/complex.f90 cuda/kernels/reorder.f90 cuda/poisson_fft.f90 cuda/sendrecv.f90 diff --git a/src/cuda/kernels/complex.f90 b/src/cuda/kernels/complex.f90 new file mode 100644 index 00000000..2584986e --- /dev/null +++ b/src/cuda/kernels/complex.f90 @@ -0,0 +1,99 @@ +module m_cuda_complex + use cudafor + + use m_common, only: dp + use m_cuda_common, only: SZ + + implicit none + +contains + + attributes(global) subroutine processfftdiv( & + div, waves, nx, ny, nz, ax, bx, ay, by, az, bz & + ) + implicit none + + ! Arguments + complex(dp), device, intent(inout), dimension(:, :, :) :: div + complex(dp), device, intent(in), dimension(:, :, :) :: waves + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz + integer, value, intent(in) :: nx, ny, nz + + ! Local variables + integer :: i, j, b, ix, iy, iz + real(dp) :: tmp_r, tmp_c, div_r, div_c + + i = threadIdx%x + b = blockIdx%x + + do j = 1, nx + ! normalisation + div_r = real(div(i, j, b), kind=dp)/(nx*ny*nz) + div_c = aimag(div(i, j, b))/(nx*ny*nz) + + ! get the indices for x, y, z directions + ix = j; iy = i + (b-1)/(nz/2+1)*SZ; iz = mod(b-1, nz/2+1) + 1 + + ! post-process forward + ! post-process in z + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bz(iz) + tmp_c*az(iz) + div_c = tmp_c*bz(iz) - tmp_r*az(iz) + + ! post-process in y + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*by(iy) + tmp_c*ay(iy) + div_c = tmp_c*by(iy) - tmp_r*ay(iy) + if ( iy > ny/2 + 1 ) div_r = -div_r + if ( iy > ny/2 + 1 ) div_c = -div_c + + ! post-process in x + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bx(ix) + tmp_c*ax(ix) + div_c = tmp_c*bx(ix) - tmp_r*ax(ix) + if ( ix > nx/2 + 1 ) div_r = -div_r + if ( ix > nx/2 + 1 ) div_c = -div_c + + ! Solve Poisson + tmp_r = real(waves(i, j, b), kind=dp) + tmp_c = aimag(waves(i, j, b)) + if ((tmp_r < 1.e-16_dp) .or. (tmp_c < 1.e-16_dp)) then + div_r = 0._dp; div_c = 0._dp + else + div_r =-div_r/tmp_r + div_c =-div_c/tmp_c + end if + + ! post-process backward + ! post-process in z + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bz(iz) - tmp_c*az(iz) + div_c =-tmp_c*bz(iz) - tmp_r*az(iz) + + ! post-process in y + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*by(iy) + tmp_c*ay(iy) + div_c = tmp_c*by(iy) - tmp_r*ay(iy) + if ( iy > ny/2 + 1 ) div_r = -div_r + if ( iy > ny/2 + 1 ) div_c = -div_c + + ! post-process in x + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bx(ix) + tmp_c*ax(ix) + div_c =-tmp_c*bx(ix) + tmp_r*ax(ix) + if ( ix > nx/2 + 1 ) div_r = -div_r + if ( ix > nx/2 + 1 ) div_c = -div_c + + ! update the entry + div(i, j, b) = cmplx(div_r, div_c, kind=dp) + end do + + end subroutine processfftdiv + +end module m_cuda_complex From d5c159ad068cb33f6a00a415fc65fe8f47cdf36f Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Tue, 20 Feb 2024 17:11:20 +0000 Subject: [PATCH 09/26] Store plans at poisson_fft class level. --- src/cuda/poisson_fft.f90 | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 index 0b2edb1f..f12ec6ac 100644 --- a/src/cuda/poisson_fft.f90 +++ b/src/cuda/poisson_fft.f90 @@ -18,6 +18,7 @@ module m_cuda_poisson_fft c_x_dev, c_y_dev, c_z_dev, waves_dev complex(dp), device, allocatable, dimension(:) :: ax_dev, bx_dev, & ay_dev, by_dev, az_dev, bz_dev + integer :: planD2Zz, planZ2Dz, planZ2Zx, planZ2Zy contains procedure :: fft_forward => fft_forward_cuda procedure :: fft_backward => fft_backward_cuda @@ -39,7 +40,7 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) integer :: nx, ny, nz, nx_loc, ny_loc, nz_loc - integer :: planD2Zz, planZ2Dz, planZ2Zx, planZ2Zy, ierrfft + integer :: ierrfft integer(int_ptr_kind()) :: worksize call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ) @@ -62,29 +63,29 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) allocate (poisson_fft%c_z_dev(nz/2 + 1, SZ, nx*ny/SZ)) ! plans for regular for loop executions in a single stream - ierrfft = cufftCreate(planD2Zz) - ierrfft = cufftMakePlanMany(planD2Zz, 1, nz, & + ierrfft = cufftCreate(poisson_fft%planD2Zz) + ierrfft = cufftMakePlanMany(poisson_fft%planD2Zz, 1, nz, & nz, 1, nz, nz/2+1, 1, nz/2+1, & CUFFT_D2Z, nx*ny, worksize) - ierrfft = cufftSetWorkArea(planD2Zz, poisson_fft%c_x_dev) + ierrfft = cufftSetWorkArea(poisson_fft%planD2Zz, poisson_fft%c_x_dev) - ierrfft = cufftCreate(planZ2Dz) - ierrfft = cufftMakePlanMany(planZ2Dz, 1, nz, & + ierrfft = cufftCreate(poisson_fft%planZ2Dz) + ierrfft = cufftMakePlanMany(poisson_fft%planZ2Dz, 1, nz, & nz/2+1, 1, nz/2+1, nz, 1, nz, & CUFFT_Z2D, nx*ny, worksize) - ierrfft = cufftSetWorkArea(planZ2Dz, poisson_fft%c_x_dev) + ierrfft = cufftSetWorkArea(poisson_fft%planZ2Dz, poisson_fft%c_x_dev) - ierrfft = cufftCreate(planZ2Zy) - ierrfft = cufftMakePlanMany(planZ2Zy, 1, ny, & + ierrfft = cufftCreate(poisson_fft%planZ2Zy) + ierrfft = cufftMakePlanMany(poisson_fft%planZ2Zy, 1, ny, & ny, 1, ny, ny, 1, ny, & CUFFT_Z2Z, nx*(nz/2 + 1), worksize) - ierrfft = cufftSetWorkArea(planZ2Zy, poisson_fft%c_x_dev) + ierrfft = cufftSetWorkArea(poisson_fft%planZ2Zy, poisson_fft%c_x_dev) - ierrfft = cufftCreate(planZ2Zx) - ierrfft = cufftMakePlanMany(planZ2Zx, 1, nx, & + ierrfft = cufftCreate(poisson_fft%planZ2Zx) + ierrfft = cufftMakePlanMany(poisson_fft%planZ2Zx, 1, nx, & nx, 1, nx, nx, 1, nx, & CUFFT_Z2Z, ny*(nz/2 + 1), worksize) - ierrfft = cufftSetWorkArea(planZ2Zx, poisson_fft%c_y_dev) + ierrfft = cufftSetWorkArea(poisson_fft%planZ2Zx, poisson_fft%c_y_dev) end function init From 2f4ad61d6dddab60c3a4fe4831313298f75f7b1f Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Wed, 21 Feb 2024 11:05:14 +0000 Subject: [PATCH 10/26] Add forward FFT transforms. --- src/cuda/poisson_fft.f90 | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 index f12ec6ac..30df86d2 100644 --- a/src/cuda/poisson_fft.f90 +++ b/src/cuda/poisson_fft.f90 @@ -7,6 +7,7 @@ module m_cuda_poisson_fft use m_poisson_fft, only: poisson_fft_t use m_tdsops, only: dirps_t + use m_cuda_allocator, only: cuda_field_t use m_cuda_common, only: SZ implicit none @@ -89,11 +90,34 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) end function init - subroutine fft_forward_cuda(self, f_in) + subroutine fft_forward_cuda(self, f) implicit none class(cuda_poisson_fft_t) :: self - class(field_t), intent(in) :: f_in + class(field_t), intent(in) :: f + + real(dp), device, pointer, dimension(:, :, :) :: f_dev + integer :: ierrfft + + select type(f); type is (cuda_field_t); f_dev => f%data_d; end select + + ! First reorder f into cartesian-like data structure + + ! Forward FFT transform in z from real to complex + ierrfft = cufftExecD2Z(self%planD2Zz, f_dev, self%c_z_dev) + + ! Reorder from z to y + + ! In-place forward FFT in y + ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, & + CUFFT_FORWARD) + + ! Reorder from y to x + + ! In-place forward FFT in x + ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, & + CUFFT_FORWARD) + end subroutine fft_forward_cuda subroutine fft_backward_cuda(self, f_out) From 3d463126fa02a90d120bd2f5754a4254ea6d3d38 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Wed, 21 Feb 2024 12:03:30 +0000 Subject: [PATCH 11/26] Add backward FFT transforms. --- src/cuda/poisson_fft.f90 | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 index 30df86d2..6fade43a 100644 --- a/src/cuda/poisson_fft.f90 +++ b/src/cuda/poisson_fft.f90 @@ -120,11 +120,34 @@ subroutine fft_forward_cuda(self, f) end subroutine fft_forward_cuda - subroutine fft_backward_cuda(self, f_out) + subroutine fft_backward_cuda(self, f) implicit none class(cuda_poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out + class(field_t), intent(inout) :: f + + real(dp), device, pointer, dimension(:, :, :) :: f_dev + integer :: ierrfft + + select type(f); type is (cuda_field_t); f_dev => f%data_d; end select + + ! In-place backward FFT in x + ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, & + CUFFT_INVERSE) + + ! Reorder from x to y + + ! In-place backward FFT in y + ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, & + CUFFT_INVERSE) + + ! Reorder from y to z + + ! Backward FFT transform in z from complex to real + ierrfft = cufftExecZ2D(self%planZ2Dz, self%c_z_dev, f_dev) + + ! Finally reorder f back into our specialist data structure + end subroutine fft_backward_cuda subroutine fft_postprocess_cuda(self) From a7c25d92f217a2bddf270e82a54e00ceb02c9129 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Wed, 21 Feb 2024 12:34:28 +0000 Subject: [PATCH 12/26] Fix allocation size of the waves array on GPU. --- src/cuda/poisson_fft.f90 | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 index 6fade43a..93500f82 100644 --- a/src/cuda/poisson_fft.f90 +++ b/src/cuda/poisson_fft.f90 @@ -39,7 +39,7 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) type(cuda_poisson_fft_t) :: poisson_fft - integer :: nx, ny, nz, nx_loc, ny_loc, nz_loc + integer :: nx, ny, nz integer :: ierrfft integer(int_ptr_kind()) :: worksize @@ -47,9 +47,8 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ) nx = poisson_fft%nx; ny = poisson_fft%ny; nz = poisson_fft%nz - nx_loc = nx; ny_loc = ny; nz_loc = nz - allocate (poisson_fft%waves_dev(nx, ny, nz)) + allocate (poisson_fft%waves_dev(SZ, nx, (ny*(nz/2 + 1))/SZ)) poisson_fft%waves_dev = poisson_fft%waves allocate (poisson_fft%ax_dev(nx), poisson_fft%bx_dev(nx)) @@ -63,7 +62,7 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) allocate (poisson_fft%c_y_dev(ny, SZ, (nx*(nz/2 + 1))/SZ)) allocate (poisson_fft%c_z_dev(nz/2 + 1, SZ, nx*ny/SZ)) - ! plans for regular for loop executions in a single stream + ! set cufft plans ierrfft = cufftCreate(poisson_fft%planD2Zz) ierrfft = cufftMakePlanMany(poisson_fft%planD2Zz, 1, nz, & nz, 1, nz, nz/2+1, 1, nz/2+1, & From 95c1b360ee5385df2e0e8025b092dd6b1ecd1776 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Wed, 21 Feb 2024 14:11:38 +0000 Subject: [PATCH 13/26] Change shape of the c_x/y/z arrays. --- src/cuda/poisson_fft.f90 | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 index 93500f82..8a741a64 100644 --- a/src/cuda/poisson_fft.f90 +++ b/src/cuda/poisson_fft.f90 @@ -15,10 +15,12 @@ module m_cuda_poisson_fft type, extends(poisson_fft_t) :: cuda_poisson_fft_t !! FFT based Poisson solver !! It can only handle 1D decompositions along z direction. - complex(dp), device, allocatable, dimension(:, :, :) :: & - c_x_dev, c_y_dev, c_z_dev, waves_dev - complex(dp), device, allocatable, dimension(:) :: ax_dev, bx_dev, & + complex(dp), device, pointer, dimension(:) :: c_x_dev, c_y_dev, c_z_dev + complex(dp), device, allocatable, dimension(:, :, :) :: waves_dev + + real(dp), device, allocatable, dimension(:) :: ax_dev, bx_dev, & ay_dev, by_dev, az_dev, bz_dev + integer :: planD2Zz, planZ2Dz, planZ2Zx, planZ2Zy contains procedure :: fft_forward => fft_forward_cuda @@ -58,9 +60,9 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) poisson_fft%ay_dev = poisson_fft%ay; poisson_fft%by_dev = poisson_fft%by poisson_fft%az_dev = poisson_fft%az; poisson_fft%bz_dev = poisson_fft%bz - allocate (poisson_fft%c_x_dev(nx, SZ, (ny*(nz/2 + 1))/SZ)) - allocate (poisson_fft%c_y_dev(ny, SZ, (nx*(nz/2 + 1))/SZ)) - allocate (poisson_fft%c_z_dev(nz/2 + 1, SZ, nx*ny/SZ)) + allocate (poisson_fft%c_x_dev(nx*ny*(nz/2 + 1))) + allocate (poisson_fft%c_y_dev(nx*ny*(nz/2 + 1))) + allocate (poisson_fft%c_z_dev(nx*ny*(nz/2 + 1))) ! set cufft plans ierrfft = cufftCreate(poisson_fft%planD2Zz) From 0c5d456e49d33934b8a8461df18848894c0e4f9c Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Wed, 21 Feb 2024 14:34:42 +0000 Subject: [PATCH 14/26] Add postprocessing kernel call. --- src/cuda/poisson_fft.f90 | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 index 8a741a64..63f97e4c 100644 --- a/src/cuda/poisson_fft.f90 +++ b/src/cuda/poisson_fft.f90 @@ -9,6 +9,7 @@ module m_cuda_poisson_fft use m_cuda_allocator, only: cuda_field_t use m_cuda_common, only: SZ + use m_cuda_complex, only: processfftdiv implicit none @@ -155,6 +156,25 @@ subroutine fft_postprocess_cuda(self) implicit none class(cuda_poisson_fft_t) :: self + + complex(dp), device, dimension(:, :, :), pointer :: c_dev + type(dim3) :: blocks, threads + + ! Reshape from cartesian-like to our specialist data structure + + blocks = dim3((self%ny*(self%nz/2 + 1))/SZ, 1 , 1) + threads = dim3(SZ, 1, 1) + + c_dev(1:SZ, 1:self%nx, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_y_dev + + call processfftdiv<<>>( & + c_dev, self%waves_dev, self%nx, self%ny, self%nz, & + self%ax_dev, self%bx_dev, self%ay_dev, self%by_dev, & + self%az_dev, self%bz_dev & + ) + + ! Reshape from our specialist data structure to cartesian-like + end subroutine fft_postprocess_cuda end module m_cuda_poisson_fft From aac5af21898c2fd7a973ac0295db27a34029ee1c Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Thu, 22 Feb 2024 14:32:35 +0000 Subject: [PATCH 15/26] Add complex reordering functions. --- src/cuda/kernels/complex.f90 | 115 +++++++++++++++++++++++++++++++++++ 1 file changed, 115 insertions(+) diff --git a/src/cuda/kernels/complex.f90 b/src/cuda/kernels/complex.f90 index 2584986e..cfbf380a 100644 --- a/src/cuda/kernels/complex.f90 +++ b/src/cuda/kernels/complex.f90 @@ -96,4 +96,119 @@ attributes(global) subroutine processfftdiv( & end subroutine processfftdiv + attributes(global) subroutine reorder_cmplx_x2y_T(u_y, u_x, nz) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: u_y + complex(dp), device, intent(in), dimension(:, :, :) :: u_x + integer, value, intent(in) :: nz + + complex(dp), shared :: tile(SZ, SZ) + + integer :: i, j, b_i, b_j, b_k + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + + ! copy into shared + tile(i, j) = u_x((b_i - 1)*SZ + j, i, b_k + nz*(b_j-1)) + + call syncthreads() + + ! copy into output array from shared + u_y((b_j - 1)*SZ + j, i, (b_i - 1)*nz + b_k) = tile(j, i) + + end subroutine reorder_cmplx_x2y_T + + attributes(global) subroutine reorder_cmplx_y2x_T(u_x, u_y, nz) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: u_x + complex(dp), device, intent(in), dimension(:, :, :) :: u_y + integer, value, intent(in) :: nz + + complex(dp), shared :: tile(SZ, SZ) + + integer :: i, j, b_i, b_j, b_k + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + + ! copy into shared + tile(i, j) = u_y((b_j - 1)*SZ + j, i, b_k + nz*(b_i - 1)) + + call syncthreads() + + ! copy into output array from shared + u_x((b_i - 1)*SZ + j, i, (b_j - 1)*nz + b_k) = tile(j, i) + + end subroutine reorder_cmplx_y2x_T + + attributes(global) subroutine reorder_cmplx_y2z_T(u_z, u_y, nx, nz) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: u_z + complex(dp), device, intent(in), dimension(:, :, :) :: u_y + integer, value, intent(in) :: nx, nz + + complex(dp), shared :: tile(SZ, SZ) + + integer :: i, j, k, b_i, b_j, b_k, b_x, b_y, b_z + + i = threadIdx%x + j = threadIdx%y + k = threadIdx%z + + b_x = blockIdx%z + b_y = blockIdx%y + b_z = blockIdx%x + + ! copy into shared + if ( j + (b_z - 1)*SZ <= nz ) & + tile(i, j) = u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, & + j + (b_z - 1)*SZ + ((b_x-1)/SZ)*nz) + + call syncthreads() + + ! copy into output array from shared + if ( i + (b_z - 1)*SZ <= nz ) & + u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx) = tile(j, i) + + end subroutine reorder_cmplx_y2z_T + + attributes(global) subroutine reorder_cmplx_z2y_T(u_y, u_z, nx, nz) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: u_y + complex(dp), device, intent(in), dimension(:, :, :) :: u_z + integer, value, intent(in) :: nx, nz + + complex(dp), shared :: tile(SZ, SZ) + + integer :: i, j, k, b_i, b_j, b_k, b_x, b_y, b_z + + i = threadIdx%x + j = threadIdx%y + k = threadIdx%z + b_i = blockIdx%x + b_j = blockIdx%y + b_k = blockIdx%z + + b_x = blockIdx%z + b_y = blockIdx%y + b_z = blockIdx%x + + ! copy into shared + if ( i + (b_z - 1)*SZ <= nz ) & + tile(i, j) = u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx) + + call syncthreads() + + ! copy into output array from shared + if ( j + (b_z - 1)*SZ <= nz ) & + u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ)+1, & + j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) = tile(j, i) + + end subroutine reorder_cmplx_z2y_T + end module m_cuda_complex From 279666dc921bc48676bc0423a899717f27b85e59 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Thu, 22 Feb 2024 16:44:57 +0000 Subject: [PATCH 16/26] Call complex reordering functions in forward and backward FFTs. --- src/cuda/poisson_fft.f90 | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 index 63f97e4c..7c7b6ed1 100644 --- a/src/cuda/poisson_fft.f90 +++ b/src/cuda/poisson_fft.f90 @@ -9,7 +9,9 @@ module m_cuda_poisson_fft use m_cuda_allocator, only: cuda_field_t use m_cuda_common, only: SZ - use m_cuda_complex, only: processfftdiv + use m_cuda_complex, only: reorder_cmplx_x2y_T, reorder_cmplx_y2x_T, & + reorder_cmplx_y2z_T, reorder_cmplx_z2y_T, & + processfftdiv implicit none @@ -99,6 +101,10 @@ subroutine fft_forward_cuda(self, f) class(field_t), intent(in) :: f real(dp), device, pointer, dimension(:, :, :) :: f_dev + complex(dp), device, dimension(:, :, :), pointer :: c_x_ptr, c_y_ptr, & + c_z_ptr + + type(dim3) :: blocks, threads integer :: ierrfft select type(f); type is (cuda_field_t); f_dev => f%data_d; end select @@ -109,12 +115,26 @@ subroutine fft_forward_cuda(self, f) ierrfft = cufftExecD2Z(self%planD2Zz, f_dev, self%c_z_dev) ! Reorder from z to y + blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx) + threads = dim3(SZ, SZ, 1) + c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + c_z_ptr(1:self%nz/2 + 1, 1:SZ, 1:self%nx*self%ny/SZ) => self%c_z_dev + + call reorder_cmplx_z2y_T<<>>(c_y_ptr, c_z_ptr, & + self%nx, self%nz/2 + 1) ! In-place forward FFT in y ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, & CUFFT_FORWARD) ! Reorder from y to x + blocks = dim3(self%nx/SZ, self%ny/SZ, self%nz/2 + 1) + threads = dim3(SZ, SZ, 1) + c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev + c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + + call reorder_cmplx_y2x_T<<>>(c_x_ptr, c_y_ptr, & + self%nz/2 + 1) ! In-place forward FFT in x ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, & @@ -129,6 +149,10 @@ subroutine fft_backward_cuda(self, f) class(field_t), intent(inout) :: f real(dp), device, pointer, dimension(:, :, :) :: f_dev + complex(dp), device, dimension(:, :, :), pointer :: c_x_ptr, c_y_ptr, & + c_z_ptr + + type(dim3) :: blocks, threads integer :: ierrfft select type(f); type is (cuda_field_t); f_dev => f%data_d; end select @@ -138,12 +162,26 @@ subroutine fft_backward_cuda(self, f) CUFFT_INVERSE) ! Reorder from x to y + blocks = dim3(self%nx/SZ, self%ny/SZ, self%nz/2 + 1) + threads = dim3(SZ, SZ, 1) + c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev + c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + + call reorder_cmplx_x2y_T<<>>(c_y_ptr, c_x_ptr, & + self%nz/2 + 1) ! In-place backward FFT in y ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, & CUFFT_INVERSE) ! Reorder from y to z + blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx) + threads = dim3(SZ, SZ, 1) + c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + c_z_ptr(1:self%nz/2 + 1, 1:SZ, 1:self%nx*self%ny/SZ) => self%c_z_dev + + call reorder_cmplx_y2z_T<<>>(c_z_ptr, c_y_ptr, & + self%nx, self%nz/2 + 1) ! Backward FFT transform in z from complex to real ierrfft = cufftExecZ2D(self%planZ2Dz, self%c_z_dev, f_dev) From eaaf2c285d72fe880b497dc0953ef7d77a9f7dfa Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Thu, 22 Feb 2024 17:22:20 +0000 Subject: [PATCH 17/26] No need to initialise backends with dirps any more. --- src/cuda/backend.f90 | 3 +-- src/omp/backend.f90 | 3 +-- src/xcompact.f90 | 4 ++-- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 741594ec..a5322ff6 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -59,12 +59,11 @@ module m_cuda_backend contains - function init(globs, allocator, xdirps, ydirps, zdirps) result(backend) + function init(globs, allocator) result(backend) implicit none class(globs_t) :: globs class(allocator_t), target, intent(inout) :: allocator - class(dirps_t), intent(in) :: xdirps, ydirps, zdirps type(cuda_backend_t) :: backend type(cuda_poisson_fft_t) :: cuda_poisson_fft diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index e1863b5a..aff888ce 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -44,12 +44,11 @@ module m_omp_backend contains - function init(globs, allocator, xdirps, ydirps, zdirps) result(backend) + function init(globs, allocator) result(backend) implicit none class(globs_t) :: globs class(allocator_t), target, intent(inout) :: allocator - class(dirps_t), intent(in) :: xdirps, ydirps, zdirps type(omp_backend_t) :: backend integer :: n_halo, n_block diff --git a/src/xcompact.f90 b/src/xcompact.f90 index d407938c..5fcbcf71 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -109,7 +109,7 @@ program xcompact allocator => cuda_allocator print*, 'CUDA allocator instantiated' - cuda_backend = cuda_backend_t(globs, allocator, xdirps, ydirps, zdirps) + cuda_backend = cuda_backend_t(globs, allocator) backend => cuda_backend print*, 'CUDA backend instantiated' #else @@ -117,7 +117,7 @@ program xcompact allocator => omp_allocator print*, 'OpenMP allocator instantiated' - omp_backend = omp_backend_t(globs, allocator, xdirps, ydirps, zdirps) + omp_backend = omp_backend_t(globs, allocator) backend => omp_backend print*, 'OpenMP backend instantiated' #endif From a26234709b25ed7226afbd81d577981d25db68ee Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Thu, 22 Feb 2024 17:29:17 +0000 Subject: [PATCH 18/26] Cleanups. --- src/omp/poisson_fft.f90 | 1 - src/poisson_fft.f90 | 11 ++--------- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/omp/poisson_fft.f90 b/src/omp/poisson_fft.f90 index f34a1957..e0a8cb1c 100644 --- a/src/omp/poisson_fft.f90 +++ b/src/omp/poisson_fft.f90 @@ -30,7 +30,6 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) class(dirps_t), intent(in) :: xdirps, ydirps, zdirps type(omp_poisson_fft_t) :: poisson_fft - integer :: nx, ny, nz call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ) diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90 index 9dc2a892..dd5708f1 100644 --- a/src/poisson_fft.f90 +++ b/src/poisson_fft.f90 @@ -55,8 +55,6 @@ subroutine base_init(self, xdirps, ydirps, zdirps, sz) class(dirps_t), intent(in) :: xdirps, ydirps, zdirps integer, intent(in) :: sz - integer :: nx, ny, nz - self%nx = xdirps%n; self%ny = ydirps%n; self%nz = zdirps%n allocate (self%ax(self%nx), self%bx(self%nx)) @@ -84,7 +82,7 @@ subroutine waves_set(self, xdirps, ydirps, zdirps, sz) real(dp) :: w, wp, rlexs, rleys, rlezs, xtt, ytt, ztt, xt1, yt1, zt1 complex(dp) :: xt2, yt2, zt2, xyzk - integer :: i, j, k, ka, kb, ix, iy, iz + integer :: i, j, ka, kb, ix, iy, iz nx = xdirps%n; ny = ydirps%n; nz = zdirps%n @@ -156,12 +154,7 @@ subroutine waves_set(self, xdirps, ydirps, zdirps, sz) zk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L)**2 end do - !do k = !sp%xst(3), sp%xen(3) - !do j = sp%xst(2), sp%xen(2) - ! kxyz array is in z pencil orientation, with nz=nz/2+1 !!nooo - ! kxyz array is in x pencil orientation. - !do k = 1, nx*ny/SZ - print*, 'set kxyz array' + print*, 'set waves array' ! TODO: do loop ranges below are valid only for single rank runs do ka = 1, nz/2 + 1 do kb = 1, ny/sz From 9824841a8f3fe4a27fdabeaebf8b8fd3a9060e0a Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Fri, 1 Mar 2024 10:14:45 +0000 Subject: [PATCH 19/26] Cleanup and style fix. --- src/cuda/kernels/complex.f90 | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/src/cuda/kernels/complex.f90 b/src/cuda/kernels/complex.f90 index cfbf380a..9007625d 100644 --- a/src/cuda/kernels/complex.f90 +++ b/src/cuda/kernels/complex.f90 @@ -32,7 +32,7 @@ attributes(global) subroutine processfftdiv( & div_c = aimag(div(i, j, b))/(nx*ny*nz) ! get the indices for x, y, z directions - ix = j; iy = i + (b-1)/(nz/2+1)*SZ; iz = mod(b-1, nz/2+1) + 1 + ix = j; iy = i + (b - 1)/(nz/2 + 1)*SZ; iz = mod(b - 1, nz/2 + 1) + 1 ! post-process forward ! post-process in z @@ -46,16 +46,16 @@ attributes(global) subroutine processfftdiv( & tmp_c = div_c div_r = tmp_r*by(iy) + tmp_c*ay(iy) div_c = tmp_c*by(iy) - tmp_r*ay(iy) - if ( iy > ny/2 + 1 ) div_r = -div_r - if ( iy > ny/2 + 1 ) div_c = -div_c + if (iy > ny/2 + 1) div_r = -div_r + if (iy > ny/2 + 1) div_c = -div_c ! post-process in x tmp_r = div_r tmp_c = div_c div_r = tmp_r*bx(ix) + tmp_c*ax(ix) div_c = tmp_c*bx(ix) - tmp_r*ax(ix) - if ( ix > nx/2 + 1 ) div_r = -div_r - if ( ix > nx/2 + 1 ) div_c = -div_c + if (ix > nx/2 + 1) div_r = -div_r + if (ix > nx/2 + 1) div_c = -div_c ! Solve Poisson tmp_r = real(waves(i, j, b), kind=dp) @@ -79,16 +79,16 @@ attributes(global) subroutine processfftdiv( & tmp_c = div_c div_r = tmp_r*by(iy) + tmp_c*ay(iy) div_c = tmp_c*by(iy) - tmp_r*ay(iy) - if ( iy > ny/2 + 1 ) div_r = -div_r - if ( iy > ny/2 + 1 ) div_c = -div_c + if (iy > ny/2 + 1) div_r = -div_r + if (iy > ny/2 + 1) div_c = -div_c ! post-process in x tmp_r = div_r tmp_c = div_c div_r = tmp_r*bx(ix) + tmp_c*ax(ix) div_c =-tmp_c*bx(ix) + tmp_r*ax(ix) - if ( ix > nx/2 + 1 ) div_r = -div_r - if ( ix > nx/2 + 1 ) div_c = -div_c + if (ix > nx/2 + 1) div_r = -div_r + if (ix > nx/2 + 1) div_c = -div_c ! update the entry div(i, j, b) = cmplx(div_r, div_c, kind=dp) @@ -111,7 +111,7 @@ attributes(global) subroutine reorder_cmplx_x2y_T(u_y, u_x, nz) b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z ! copy into shared - tile(i, j) = u_x((b_i - 1)*SZ + j, i, b_k + nz*(b_j-1)) + tile(i, j) = u_x((b_i - 1)*SZ + j, i, b_k + nz*(b_j - 1)) call syncthreads() @@ -166,7 +166,7 @@ attributes(global) subroutine reorder_cmplx_y2z_T(u_z, u_y, nx, nz) ! copy into shared if ( j + (b_z - 1)*SZ <= nz ) & tile(i, j) = u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, & - j + (b_z - 1)*SZ + ((b_x-1)/SZ)*nz) + j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) call syncthreads() @@ -185,14 +185,11 @@ attributes(global) subroutine reorder_cmplx_z2y_T(u_y, u_z, nx, nz) complex(dp), shared :: tile(SZ, SZ) - integer :: i, j, k, b_i, b_j, b_k, b_x, b_y, b_z + integer :: i, j, k, b_x, b_y, b_z i = threadIdx%x j = threadIdx%y k = threadIdx%z - b_i = blockIdx%x - b_j = blockIdx%y - b_k = blockIdx%z b_x = blockIdx%z b_y = blockIdx%y @@ -206,7 +203,7 @@ attributes(global) subroutine reorder_cmplx_z2y_T(u_y, u_z, nx, nz) ! copy into output array from shared if ( j + (b_z - 1)*SZ <= nz ) & - u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ)+1, & + u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, & j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) = tile(j, i) end subroutine reorder_cmplx_z2y_T From a5e2a1a96758b7c59d0d96bb98cb8bf8d9736549 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Fri, 1 Mar 2024 11:17:15 +0000 Subject: [PATCH 20/26] Add reshape subrotines for transposing pencil groups. --- src/cuda/kernels/complex.f90 | 84 ++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/src/cuda/kernels/complex.f90 b/src/cuda/kernels/complex.f90 index 9007625d..0992586d 100644 --- a/src/cuda/kernels/complex.f90 +++ b/src/cuda/kernels/complex.f90 @@ -208,4 +208,88 @@ attributes(global) subroutine reorder_cmplx_z2y_T(u_y, u_z, nx, nz) end subroutine reorder_cmplx_z2y_T + attributes(global) subroutine reshapeDSF(uout, uin) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: uout + real(dp), device, intent(in), dimension(:, :, :) :: uin + + real(dp), shared :: tile(SZ + 1, SZ) + + integer :: i, j, b_i, b + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b = blockIdx%y + + tile(i, j) = uin(i, j + (b_i - 1)*SZ, b) + + call syncthreads() + + uout(i + (b_i - 1)*SZ, j, b) = tile(j, i) + + end subroutine reshapeDSF + + attributes(global) subroutine reshapeDSB(uout, uin) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: uout + real(dp), device, intent(in), dimension(:, :, :) :: uin + + real(dp), shared :: tile(SZ + 1, SZ) + + integer :: i, j, b_i, b + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b = blockIdx%y + + tile(i, j) = uin(i + (b_i - 1)*SZ, j, b) + + call syncthreads() + + uout(i, j + (b_i - 1)*SZ, b) = tile(j, i) + + end subroutine reshapeDSB + + attributes(global) subroutine reshapeCDSF(uout, uin) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: uout + complex(dp), device, intent(in), dimension(:, :, :) :: uin + + complex(dp), shared :: tile(SZ + 1, SZ) + + integer :: i, j, b_i, b + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b = blockIdx%y + + tile(i, j) = uin(i, j + (b_i - 1)*SZ, b) + + call syncthreads() + + uout(i + (b_i - 1)*SZ, j, b) = tile(j, i) + + end subroutine reshapeCDSF + + attributes(global) subroutine reshapeCDSB(uout, uin) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: uout + complex(dp), device, intent(in), dimension(:, :, :) :: uin + + complex(dp), shared :: tile(SZ + 1, SZ) + + integer :: i, j, b_i, b + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b = blockIdx%y + + tile(i, j) = uin(i + (b_i - 1)*SZ, j, b) + + call syncthreads() + + uout(i, j + (b_i - 1)*SZ, b) = tile(j, i) + + end subroutine reshapeCDSB + end module m_cuda_complex From cbb2dcd053d6150f97d5776ddaa05ad62d88757b Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Fri, 1 Mar 2024 11:23:48 +0000 Subject: [PATCH 21/26] Use reshapes to carry out local transposes. --- src/cuda/poisson_fft.f90 | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 index 7c7b6ed1..5a5c51ad 100644 --- a/src/cuda/poisson_fft.f90 +++ b/src/cuda/poisson_fft.f90 @@ -24,6 +24,8 @@ module m_cuda_poisson_fft real(dp), device, allocatable, dimension(:) :: ax_dev, bx_dev, & ay_dev, by_dev, az_dev, bz_dev + real(dp), device, allocatable, dimension(:, :, :) :: f_tmp + integer :: planD2Zz, planZ2Dz, planZ2Zx, planZ2Zy contains procedure :: fft_forward => fft_forward_cuda @@ -67,6 +69,11 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) allocate (poisson_fft%c_y_dev(nx*ny*(nz/2 + 1))) allocate (poisson_fft%c_z_dev(nx*ny*(nz/2 + 1))) + ! We can't currently ask allocator to pass us an array with + ! exact shape we want, so we allocate an extra array here. + ! This will be removed when allocator is fixed. + allocate (poisson_fft%f_tmp(nz, SZ, nx*ny/SZ)) + ! set cufft plans ierrfft = cufftCreate(poisson_fft%planD2Zz) ierrfft = cufftMakePlanMany(poisson_fft%planD2Zz, 1, nz, & @@ -110,9 +117,12 @@ subroutine fft_forward_cuda(self, f) select type(f); type is (cuda_field_t); f_dev => f%data_d; end select ! First reorder f into cartesian-like data structure + blocks = dim3(self%nz/SZ, self%nx*self%ny/SZ, 1) + threads = dim3(SZ, SZ, 1) + call reshapeDSF<<>>(self%f_tmp, f_dev) ! Forward FFT transform in z from real to complex - ierrfft = cufftExecD2Z(self%planD2Zz, f_dev, self%c_z_dev) + ierrfft = cufftExecD2Z(self%planD2Zz, self%f_tmp, self%c_z_dev) ! Reorder from z to y blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx) @@ -184,9 +194,12 @@ subroutine fft_backward_cuda(self, f) self%nx, self%nz/2 + 1) ! Backward FFT transform in z from complex to real - ierrfft = cufftExecZ2D(self%planZ2Dz, self%c_z_dev, f_dev) + ierrfft = cufftExecZ2D(self%planZ2Dz, self%c_z_dev, self%f_tmp) ! Finally reorder f back into our specialist data structure + blocks = dim3(self%nz/SZ, self%nx*self%ny/SZ, 1) + threads = dim3(SZ, SZ, 1) + call reshapeDSB<<>>(f_dev, self%f_tmp) end subroutine fft_backward_cuda @@ -195,16 +208,21 @@ subroutine fft_postprocess_cuda(self) class(cuda_poisson_fft_t) :: self - complex(dp), device, dimension(:, :, :), pointer :: c_dev + complex(dp), device, dimension(:, :, :), pointer :: c_dev, c_x_ptr type(dim3) :: blocks, threads + ! Get some complex array pointers with right shape + c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev + c_dev(1:SZ, 1:self%nx, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_y_dev + ! Reshape from cartesian-like to our specialist data structure + blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1) + threads = dim3(SZ, SZ, 1) + call reshapeCDSB<<>>(c_dev, c_x_ptr) - blocks = dim3((self%ny*(self%nz/2 + 1))/SZ, 1 , 1) + ! Postprocess + blocks = dim3((self%ny*(self%nz/2 + 1))/SZ, 1, 1) threads = dim3(SZ, 1, 1) - - c_dev(1:SZ, 1:self%nx, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_y_dev - call processfftdiv<<>>( & c_dev, self%waves_dev, self%nx, self%ny, self%nz, & self%ax_dev, self%bx_dev, self%ay_dev, self%by_dev, & @@ -212,6 +230,9 @@ subroutine fft_postprocess_cuda(self) ) ! Reshape from our specialist data structure to cartesian-like + blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1) + threads = dim3(SZ, SZ, 1) + call reshapeCDSF<<>>(c_x_ptr, c_dev) end subroutine fft_postprocess_cuda From 95164c30dda4a3652589170f3bc7207127ad3119 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Fri, 1 Mar 2024 16:11:28 +0000 Subject: [PATCH 22/26] Change the Poisson solver selection mechanism. --- src/common.f90 | 4 +++- src/solver.f90 | 10 ++++++---- src/xcompact.f90 | 5 +++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/common.f90 b/src/common.f90 index 433f985d..31f5f29c 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -7,6 +7,8 @@ module m_common integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, & RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32 + integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1 + type :: globs_t integer :: nx, ny, nz integer :: nx_loc, ny_loc, nz_loc @@ -15,7 +17,7 @@ module m_common real(dp) :: dx, dy, dz integer :: nproc_x = 1, nproc_y = 1, nproc_z = 1 character(len=20) :: BC_x_s, BC_x_e, BC_y_s, BC_y_e, BC_z_s, BC_z_e - logical :: use_fft + integer :: poisson_solver_type end type globs_t contains diff --git a/src/solver.f90 b/src/solver.f90 index 80fc5326..2c598f91 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -2,7 +2,8 @@ module m_solver use m_allocator, only: allocator_t, field_t use m_base_backend, only: base_backend_t use m_common, only: dp, globs_t, & - RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y + RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, & + POISSON_SOLVER_FFT, POISSON_SOLVER_CG use m_tdsops, only: tdsops_t, dirps_t use m_time_integrator, only: time_intg_t @@ -122,14 +123,15 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & call allocate_tdsops(solver%ydirps, ny, dy, solver%backend) call allocate_tdsops(solver%zdirps, nz, dz, solver%backend) - if (globs%use_fft) then + select case (globs%poisson_solver_type) + case (POISSON_SOLVER_FFT) print*, 'Poisson solver: FFT' call solver%backend%init_poisson_fft(xdirps, ydirps, zdirps) solver%poisson => poisson_fft - else + case (POISSON_SOLVER_CG) print*, 'Poisson solver: CG' solver%poisson => poisson_cg - end if + end select end function init diff --git a/src/xcompact.f90 b/src/xcompact.f90 index 5fcbcf71..1600d4ad 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -3,7 +3,8 @@ program xcompact use m_allocator use m_base_backend - use m_common, only: pi, globs_t, set_pprev_pnext + use m_common, only: pi, globs_t, set_pprev_pnext, & + POISSON_SOLVER_FFT, POISSON_SOLVER_CG use m_solver, only: solver_t use m_time_integrator, only: time_intg_t use m_tdsops, only: tdsops_t @@ -63,7 +64,7 @@ program xcompact ! set nprocs based on run time arguments globs%nproc_x = 1; globs%nproc_y = 1; globs%nproc_z = 1 - globs%use_fft = .true. + globs%poisson_solver_type = POISSON_SOLVER_FFT ! Lets allow a 1D decomposition for the moment !globs%nproc_x = nproc From 68b47d02a1b3a0a3c2453de1a01248b0170cb286 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Fri, 1 Mar 2024 16:39:51 +0000 Subject: [PATCH 23/26] Add initial conditions for TGV. --- src/common.f90 | 1 + src/solver.f90 | 37 +++++++++++++++++++++++++++++-------- src/xcompact.f90 | 5 +++-- 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/src/common.f90 b/src/common.f90 index 31f5f29c..9d0c5a86 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -15,6 +15,7 @@ module m_common integer :: n_groups_x, n_groups_y, n_groups_z real(dp) :: Lx, Ly, Lz real(dp) :: dx, dy, dz + real(dp) :: nu, dt integer :: nproc_x = 1, nproc_y = 1, nproc_z = 1 character(len=20) :: BC_x_s, BC_x_e, BC_y_s, BC_y_e, BC_z_s, BC_z_e integer :: poisson_solver_type diff --git a/src/solver.f90 b/src/solver.f90 index 2c598f91..78618692 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -84,8 +84,8 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & real(dp), allocatable, dimension(:, :, :) :: u_init, v_init, w_init integer :: dims(3) - real(dp) :: dx, dy, dz - integer :: nx, ny, nz + real(dp) :: x, y, z + integer :: nx, ny, nz, i, j, b, ka, kb, ix, iy, iz, sz solver%backend => backend solver%time_integrator => time_integrator @@ -108,6 +108,30 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & v_init = 0 w_init = 0 + solver%dt = globs%dt + solver%backend%nu = globs%nu + + sz = dims(1) + nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc + do ka = 1, nz + do kb = 1, ny/sz + do j = 1, nx + do i = 1, sz + ! Mapping to ix, iy, iz depends on global group numbering + ix = j; iy = (kb - 1)*sz + i; iz = ka + x = (ix - 1)*globs%dx + y = (iy - 1)*globs%dy + z = (iz - 1)*globs%dz + + b = ka + (kb - 1)*xdirps%n + u_init(i, j, b) = sin(x)*cos(y)*cos(z) + v_init(i, j, b) =-cos(x)*sin(y)*cos(z) + w_init(i, j, b) = 0 + end do + end do + end do + end do + call solver%backend%set_field(solver%u, u_init) call solver%backend%set_field(solver%v, v_init) call solver%backend%set_field(solver%w, w_init) @@ -115,13 +139,10 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & deallocate(u_init, v_init, w_init) print*, 'initial conditions are set' - nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc - dx = globs%dx; dy = globs%dy; dz = globs%dz - ! Allocate and set the tdsops - call allocate_tdsops(solver%xdirps, nx, dx, solver%backend) - call allocate_tdsops(solver%ydirps, ny, dy, solver%backend) - call allocate_tdsops(solver%zdirps, nz, dz, solver%backend) + call allocate_tdsops(solver%xdirps, nx, globs%dx, solver%backend) + call allocate_tdsops(solver%ydirps, ny, globs%dy, solver%backend) + call allocate_tdsops(solver%zdirps, nz, globs%dz, solver%backend) select case (globs%poisson_solver_type) case (POISSON_SOLVER_FFT) diff --git a/src/xcompact.f90 b/src/xcompact.f90 index 1600d4ad..3a2726fa 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -61,6 +61,9 @@ program xcompact ! read ns from the input file globs%nx = 512; globs%ny = 512; globs%nz = 512 + globs%dt = 0.001_dp + globs%nu = 1._dp/1600._dp + ! set nprocs based on run time arguments globs%nproc_x = 1; globs%nproc_y = 1; globs%nproc_z = 1 @@ -123,8 +126,6 @@ program xcompact print*, 'OpenMP backend instantiated' #endif - backend%nu = 1._dp - allocate(u(SZ, globs%nx_loc, globs%n_groups_x)) allocate(v(SZ, globs%nx_loc, globs%n_groups_x)) allocate(w(SZ, globs%nx_loc, globs%n_groups_x)) From 0a4be354aa5ad32fa7c0e4cb8bddbe48c5889b31 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Tue, 5 Mar 2024 14:46:48 +0000 Subject: [PATCH 24/26] Output enstrophy and divergence of u. --- src/common.f90 | 1 + src/solver.f90 | 57 ++++++++++++++++++++++++++++++++++++++++++------ src/xcompact.f90 | 6 +++-- 3 files changed, 55 insertions(+), 9 deletions(-) diff --git a/src/common.f90 b/src/common.f90 index 9d0c5a86..055620f8 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -16,6 +16,7 @@ module m_common real(dp) :: Lx, Ly, Lz real(dp) :: dx, dy, dz real(dp) :: nu, dt + integer :: n_iters, n_output integer :: nproc_x = 1, nproc_y = 1, nproc_z = 1 character(len=20) :: BC_x_s, BC_x_e, BC_y_s, BC_y_e, BC_z_s, BC_z_e integer :: poisson_solver_type diff --git a/src/solver.f90 b/src/solver.f90 index 78618692..423f304b 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -38,6 +38,7 @@ module m_solver !! for later use. real(dp) :: dt, nu + integer :: n_iters, n_output class(field_t), pointer :: u, v, w @@ -50,6 +51,7 @@ module m_solver procedure :: divergence_v2p procedure :: gradient_p2v procedure :: curl + procedure :: output procedure :: run end type solver_t @@ -104,12 +106,10 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & allocate(v_init(dims(1), dims(2), dims(3))) allocate(w_init(dims(1), dims(2), dims(3))) - u_init = 0 - v_init = 0 - w_init = 0 - solver%dt = globs%dt solver%backend%nu = globs%nu + solver%n_iters = globs%n_iters + solver%n_output = globs%n_output sz = dims(1) nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc @@ -563,20 +563,58 @@ subroutine poisson_cg(self, pressure, div_u) end subroutine poisson_cg - subroutine run(self, n_iter, u_out, v_out, w_out) + subroutine output(self, t, u_out) + implicit none + + class(solver_t), intent(in) :: self + real(dp), intent(in) :: t + real(dp), dimension(:, :, :), intent(inout) :: u_out + + class(field_t), pointer :: du, dv, dw + integer :: ngrid + + ngrid = self%xdirps%n*self%ydirps%n*self%zdirps%n + print*, 'time = ', t + + du => self%backend%allocator%get_block() + dv => self%backend%allocator%get_block() + dw => self%backend%allocator%get_block() + + call self%curl(du, dv, dw, self%u, self%v, self%w) + print*, 'enstrophy:', 0.5_dp*( & + self%backend%scalar_product(du, du) & + + self%backend%scalar_product(dv, dv) & + + self%backend%scalar_product(dw, dw) & + )/ngrid + + call self%backend%allocator%release_block(du) + call self%backend%allocator%release_block(dv) + call self%backend%allocator%release_block(dw) + + call self%divergence_v2p(du, self%u, self%v, self%w) + call self%backend%get_field(u_out, du) + print*, 'div u max mean:', maxval(abs(u_out)), sum(abs(u_out))/ngrid + + end subroutine output + + subroutine run(self, u_out, v_out, w_out) implicit none class(solver_t), intent(in) :: self - integer, intent(in) :: n_iter real(dp), dimension(:, :, :), intent(inout) :: u_out, v_out, w_out class(field_t), pointer :: du, dv, dw, div_u, pressure, dpdx, dpdy, dpdz + real(dp) :: t integer :: i + print*, 'initial conditions' + t = 0._dp + call self%output(t, u_out) + print*, 'start run' - do i = 1, n_iter + do i = 1, self%n_iters du => self%backend%allocator%get_block() dv => self%backend%allocator%get_block() dw => self%backend%allocator%get_block() @@ -618,6 +656,11 @@ subroutine run(self, n_iter, u_out, v_out, w_out) call self%backend%allocator%release_block(dpdx) call self%backend%allocator%release_block(dpdy) call self%backend%allocator%release_block(dpdz) + + if ( mod(i, self%n_output) == 0 ) then + t = i*self%dt + call self%output(t, u_out) + end if end do print*, 'run end' diff --git a/src/xcompact.f90 b/src/xcompact.f90 index 3a2726fa..635db2a7 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -59,10 +59,12 @@ program xcompact xdirps%L = globs%Lx; ydirps%L = globs%Ly; zdirps%L = globs%Lz ! read ns from the input file - globs%nx = 512; globs%ny = 512; globs%nz = 512 + globs%nx = 256; globs%ny = 256; globs%nz = 256 globs%dt = 0.001_dp globs%nu = 1._dp/1600._dp + globs%n_iters = 20000 + globs%n_output = 100 ! set nprocs based on run time arguments globs%nproc_x = 1; globs%nproc_y = 1; globs%nproc_z = 1 @@ -137,7 +139,7 @@ program xcompact call cpu_time(t_start) - call solver%run(100, u, v, w) + call solver%run(u, v, w) call cpu_time(t_end) From f8a1ecdea8bb83806801661efd6d9e73fcaf9999 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Wed, 6 Mar 2024 17:25:44 +0000 Subject: [PATCH 25/26] Cleanups and comments. --- src/cuda/kernels/complex.f90 | 12 ++++++------ src/cuda/poisson_fft.f90 | 9 +++++++-- src/omp/poisson_fft.f90 | 2 ++ src/poisson_fft.f90 | 6 +++--- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/cuda/kernels/complex.f90 b/src/cuda/kernels/complex.f90 index 0992586d..32c5041f 100644 --- a/src/cuda/kernels/complex.f90 +++ b/src/cuda/kernels/complex.f90 @@ -8,7 +8,7 @@ module m_cuda_complex contains - attributes(global) subroutine processfftdiv( & + attributes(global) subroutine process_spectral_div_u( & div, waves, nx, ny, nz, ax, bx, ay, by, az, bz & ) implicit none @@ -63,8 +63,8 @@ attributes(global) subroutine processfftdiv( & if ((tmp_r < 1.e-16_dp) .or. (tmp_c < 1.e-16_dp)) then div_r = 0._dp; div_c = 0._dp else - div_r =-div_r/tmp_r - div_c =-div_c/tmp_c + div_r = -div_r/tmp_r + div_c = -div_c/tmp_c end if ! post-process backward @@ -72,7 +72,7 @@ attributes(global) subroutine processfftdiv( & tmp_r = div_r tmp_c = div_c div_r = tmp_r*bz(iz) - tmp_c*az(iz) - div_c =-tmp_c*bz(iz) - tmp_r*az(iz) + div_c = -tmp_c*bz(iz) - tmp_r*az(iz) ! post-process in y tmp_r = div_r @@ -86,7 +86,7 @@ attributes(global) subroutine processfftdiv( & tmp_r = div_r tmp_c = div_c div_r = tmp_r*bx(ix) + tmp_c*ax(ix) - div_c =-tmp_c*bx(ix) + tmp_r*ax(ix) + div_c = -tmp_c*bx(ix) + tmp_r*ax(ix) if (ix > nx/2 + 1) div_r = -div_r if (ix > nx/2 + 1) div_c = -div_c @@ -94,7 +94,7 @@ attributes(global) subroutine processfftdiv( & div(i, j, b) = cmplx(div_r, div_c, kind=dp) end do - end subroutine processfftdiv + end subroutine process_spectral_div_u attributes(global) subroutine reorder_cmplx_x2y_T(u_y, u_x, nz) implicit none diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 index 5a5c51ad..3df787a2 100644 --- a/src/cuda/poisson_fft.f90 +++ b/src/cuda/poisson_fft.f90 @@ -11,14 +11,17 @@ module m_cuda_poisson_fft use m_cuda_common, only: SZ use m_cuda_complex, only: reorder_cmplx_x2y_T, reorder_cmplx_y2x_T, & reorder_cmplx_y2z_T, reorder_cmplx_z2y_T, & - processfftdiv + process_spectral_div_u implicit none type, extends(poisson_fft_t) :: cuda_poisson_fft_t !! FFT based Poisson solver !! It can only handle 1D decompositions along z direction. + + !> Local domain sized arrays to store data in spectral space complex(dp), device, pointer, dimension(:) :: c_x_dev, c_y_dev, c_z_dev + !> Local domain sized array storing the spectral equivalence constants complex(dp), device, allocatable, dimension(:, :, :) :: waves_dev real(dp), device, allocatable, dimension(:) :: ax_dev, bx_dev, & @@ -37,6 +40,8 @@ module m_cuda_poisson_fft module procedure init end interface cuda_poisson_fft_t + private :: init + contains function init(xdirps, ydirps, zdirps) result(poisson_fft) @@ -223,7 +228,7 @@ subroutine fft_postprocess_cuda(self) ! Postprocess blocks = dim3((self%ny*(self%nz/2 + 1))/SZ, 1, 1) threads = dim3(SZ, 1, 1) - call processfftdiv<<>>( & + call process_spectral_div_u<<>>( & c_dev, self%waves_dev, self%nx, self%ny, self%nz, & self%ax_dev, self%bx_dev, self%ay_dev, self%by_dev, & self%az_dev, self%bz_dev & diff --git a/src/omp/poisson_fft.f90 b/src/omp/poisson_fft.f90 index e0a8cb1c..a796f1fc 100644 --- a/src/omp/poisson_fft.f90 +++ b/src/omp/poisson_fft.f90 @@ -22,6 +22,8 @@ module m_omp_poisson_fft module procedure init end interface omp_poisson_fft_t + private :: init + contains function init(xdirps, ydirps, zdirps) result(poisson_fft) diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90 index dd5708f1..9ee0f575 100644 --- a/src/poisson_fft.f90 +++ b/src/poisson_fft.f90 @@ -69,6 +69,7 @@ subroutine base_init(self, xdirps, ydirps, zdirps, sz) end subroutine base_init subroutine waves_set(self, xdirps, ydirps, zdirps, sz) + !! Ref. JCP 228 (2009), 5989–6015, Sec 4 implicit none class(poisson_fft_t) :: self @@ -84,7 +85,6 @@ subroutine waves_set(self, xdirps, ydirps, zdirps, sz) integer :: i, j, ka, kb, ix, iy, iz - nx = xdirps%n; ny = ydirps%n; nz = zdirps%n do i = 1, nx @@ -154,13 +154,13 @@ subroutine waves_set(self, xdirps, ydirps, zdirps, sz) zk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L)**2 end do - print*, 'set waves array' + print*, 'waves array is correctly set only for a single rank run' ! TODO: do loop ranges below are valid only for single rank runs do ka = 1, nz/2 + 1 do kb = 1, ny/sz do j = 1, nx do i = 1, sz - ix = j; iy = (kb-1)*sz+i; iz = ka !+ xderps%nspz_st - 1 + ix = j; iy = (kb - 1)*sz + i; iz = ka rlexs = real(exs(ix), kind=dp)*xdirps%d rleys = real(eys(iy), kind=dp)*ydirps%d rlezs = real(ezs(iz), kind=dp)*zdirps%d From 370d1c4d6f9a1541228f1f99c23f90e6c26b7049 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Thu, 7 Mar 2024 12:05:43 +0000 Subject: [PATCH 26/26] Comment. --- src/solver.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/solver.f90 b/src/solver.f90 index 423f304b..c9a04a11 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -150,7 +150,7 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & call solver%backend%init_poisson_fft(xdirps, ydirps, zdirps) solver%poisson => poisson_fft case (POISSON_SOLVER_CG) - print*, 'Poisson solver: CG' + print*, 'Poisson solver: CG, not yet implemented' solver%poisson => poisson_cg end select