diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 330a14c1..b2116433 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -2,12 +2,14 @@ set(SRC allocator.f90 backend.f90 common.f90 + poisson_fft.f90 solver.f90 tdsops.f90 time_integrator.f90 omp/backend.f90 omp/common.f90 omp/kernels_dist.f90 + omp/poisson_fft.f90 omp/sendrecv.f90 omp/exec_dist.f90 ) @@ -17,7 +19,9 @@ set(CUDASRC cuda/common.f90 cuda/exec_dist.f90 cuda/kernels/distributed.f90 + cuda/kernels/complex.f90 cuda/kernels/reorder.f90 + cuda/poisson_fft.f90 cuda/sendrecv.f90 cuda/tdsops.f90 ) @@ -40,6 +44,7 @@ if(${CMAKE_Fortran_COMPILER_ID} STREQUAL "PGI" OR set(CMAKE_Fortran_FLAGS_DEBUG "-g -O0 -traceback -Mbounds -Mchkptr -Ktrap=fp") set(CMAKE_Fortran_FLAGS_RELEASE "-O3 -fast") target_link_options(x3d2 INTERFACE "-cuda") + target_link_options(x3d2 INTERFACE "-lcufft") target_compile_options(xcompact PRIVATE "-DCUDA") # target_link_options(xcompact INTERFACE "-cuda") diff --git a/src/backend.f90 b/src/backend.f90 index cf453dad..e17970e5 100644 --- a/src/backend.f90 +++ b/src/backend.f90 @@ -1,6 +1,7 @@ module m_base_backend use m_allocator, only: allocator_t, field_t use m_common, only: dp + use m_poisson_fft, only: poisson_fft_t use m_tdsops, only: tdsops_t, dirps_t implicit none @@ -24,6 +25,7 @@ module m_base_backend integer :: nx_loc, ny_loc, nz_loc class(allocator_t), pointer :: allocator class(dirps_t), pointer :: xdirps, ydirps, zdirps + class(poisson_fft_t), pointer :: poisson_fft contains procedure(transeq_ders), deferred :: transeq_x procedure(transeq_ders), deferred :: transeq_y @@ -37,6 +39,7 @@ module m_base_backend procedure(get_field), deferred :: get_field procedure(set_field), deferred :: set_field procedure(alloc_tdsops), deferred :: alloc_tdsops + procedure(init_poisson_fft), deferred :: init_poisson_fft end type base_backend_t abstract interface @@ -188,4 +191,15 @@ subroutine alloc_tdsops(self, tdsops, n, dx, operation, scheme, n_halo, & end subroutine alloc_tdsops end interface + abstract interface + subroutine init_poisson_fft(self, xdirps, ydirps, zdirps) + import :: base_backend_t + import :: dirps_t + implicit none + + class(base_backend_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + end subroutine init_poisson_fft + end interface + end module m_base_backend diff --git a/src/common.f90 b/src/common.f90 index 3b46919c..055620f8 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -7,14 +7,19 @@ module m_common integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, & RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32 + integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1 + type :: globs_t integer :: nx, ny, nz integer :: nx_loc, ny_loc, nz_loc integer :: n_groups_x, n_groups_y, n_groups_z real(dp) :: Lx, Ly, Lz real(dp) :: dx, dy, dz + real(dp) :: nu, dt + integer :: n_iters, n_output integer :: nproc_x = 1, nproc_y = 1, nproc_z = 1 character(len=20) :: BC_x_s, BC_x_e, BC_y_s, BC_y_e, BC_z_s, BC_z_e + integer :: poisson_solver_type end type globs_t contains diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 1097d574..a5322ff6 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -7,11 +7,13 @@ module m_cuda_backend use m_base_backend, only: base_backend_t use m_common, only: dp, globs_t, & RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y + use m_poisson_fft, only: poisson_fft_t use m_tdsops, only: dirps_t, tdsops_t use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t use m_cuda_common, only: SZ use m_cuda_exec_dist, only: exec_dist_transeq_3fused, exec_dist_tds_compact + use m_cuda_poisson_fft, only: cuda_poisson_fft_t use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields use m_cuda_tdsops, only: cuda_tdsops_t use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs @@ -45,6 +47,7 @@ module m_cuda_backend procedure :: scalar_product => scalar_product_cuda procedure :: set_field => set_field_cuda procedure :: get_field => get_field_cuda + procedure :: init_poisson_fft => init_cuda_poisson_fft procedure :: transeq_cuda_dist procedure :: transeq_cuda_thom procedure :: tds_solve_dist @@ -63,6 +66,7 @@ function init(globs, allocator) result(backend) class(allocator_t), target, intent(inout) :: allocator type(cuda_backend_t) :: backend + type(cuda_poisson_fft_t) :: cuda_poisson_fft integer :: n_halo, n_block select type(allocator) @@ -600,5 +604,20 @@ subroutine get_field_cuda(self, arr, f) end subroutine get_field_cuda + subroutine init_cuda_poisson_fft(self, xdirps, ydirps, zdirps) + implicit none + + class(cuda_backend_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + allocate(cuda_poisson_fft_t :: self%poisson_fft) + + select type (poisson_fft => self%poisson_fft) + type is (cuda_poisson_fft_t) + poisson_fft = cuda_poisson_fft_t(xdirps, ydirps, zdirps) + end select + + end subroutine init_cuda_poisson_fft + end module m_cuda_backend diff --git a/src/cuda/kernels/complex.f90 b/src/cuda/kernels/complex.f90 new file mode 100644 index 00000000..32c5041f --- /dev/null +++ b/src/cuda/kernels/complex.f90 @@ -0,0 +1,295 @@ +module m_cuda_complex + use cudafor + + use m_common, only: dp + use m_cuda_common, only: SZ + + implicit none + +contains + + attributes(global) subroutine process_spectral_div_u( & + div, waves, nx, ny, nz, ax, bx, ay, by, az, bz & + ) + implicit none + + ! Arguments + complex(dp), device, intent(inout), dimension(:, :, :) :: div + complex(dp), device, intent(in), dimension(:, :, :) :: waves + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz + integer, value, intent(in) :: nx, ny, nz + + ! Local variables + integer :: i, j, b, ix, iy, iz + real(dp) :: tmp_r, tmp_c, div_r, div_c + + i = threadIdx%x + b = blockIdx%x + + do j = 1, nx + ! normalisation + div_r = real(div(i, j, b), kind=dp)/(nx*ny*nz) + div_c = aimag(div(i, j, b))/(nx*ny*nz) + + ! get the indices for x, y, z directions + ix = j; iy = i + (b - 1)/(nz/2 + 1)*SZ; iz = mod(b - 1, nz/2 + 1) + 1 + + ! post-process forward + ! post-process in z + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bz(iz) + tmp_c*az(iz) + div_c = tmp_c*bz(iz) - tmp_r*az(iz) + + ! post-process in y + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*by(iy) + tmp_c*ay(iy) + div_c = tmp_c*by(iy) - tmp_r*ay(iy) + if (iy > ny/2 + 1) div_r = -div_r + if (iy > ny/2 + 1) div_c = -div_c + + ! post-process in x + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bx(ix) + tmp_c*ax(ix) + div_c = tmp_c*bx(ix) - tmp_r*ax(ix) + if (ix > nx/2 + 1) div_r = -div_r + if (ix > nx/2 + 1) div_c = -div_c + + ! Solve Poisson + tmp_r = real(waves(i, j, b), kind=dp) + tmp_c = aimag(waves(i, j, b)) + if ((tmp_r < 1.e-16_dp) .or. (tmp_c < 1.e-16_dp)) then + div_r = 0._dp; div_c = 0._dp + else + div_r = -div_r/tmp_r + div_c = -div_c/tmp_c + end if + + ! post-process backward + ! post-process in z + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bz(iz) - tmp_c*az(iz) + div_c = -tmp_c*bz(iz) - tmp_r*az(iz) + + ! post-process in y + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*by(iy) + tmp_c*ay(iy) + div_c = tmp_c*by(iy) - tmp_r*ay(iy) + if (iy > ny/2 + 1) div_r = -div_r + if (iy > ny/2 + 1) div_c = -div_c + + ! post-process in x + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bx(ix) + tmp_c*ax(ix) + div_c = -tmp_c*bx(ix) + tmp_r*ax(ix) + if (ix > nx/2 + 1) div_r = -div_r + if (ix > nx/2 + 1) div_c = -div_c + + ! update the entry + div(i, j, b) = cmplx(div_r, div_c, kind=dp) + end do + + end subroutine process_spectral_div_u + + attributes(global) subroutine reorder_cmplx_x2y_T(u_y, u_x, nz) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: u_y + complex(dp), device, intent(in), dimension(:, :, :) :: u_x + integer, value, intent(in) :: nz + + complex(dp), shared :: tile(SZ, SZ) + + integer :: i, j, b_i, b_j, b_k + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + + ! copy into shared + tile(i, j) = u_x((b_i - 1)*SZ + j, i, b_k + nz*(b_j - 1)) + + call syncthreads() + + ! copy into output array from shared + u_y((b_j - 1)*SZ + j, i, (b_i - 1)*nz + b_k) = tile(j, i) + + end subroutine reorder_cmplx_x2y_T + + attributes(global) subroutine reorder_cmplx_y2x_T(u_x, u_y, nz) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: u_x + complex(dp), device, intent(in), dimension(:, :, :) :: u_y + integer, value, intent(in) :: nz + + complex(dp), shared :: tile(SZ, SZ) + + integer :: i, j, b_i, b_j, b_k + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + + ! copy into shared + tile(i, j) = u_y((b_j - 1)*SZ + j, i, b_k + nz*(b_i - 1)) + + call syncthreads() + + ! copy into output array from shared + u_x((b_i - 1)*SZ + j, i, (b_j - 1)*nz + b_k) = tile(j, i) + + end subroutine reorder_cmplx_y2x_T + + attributes(global) subroutine reorder_cmplx_y2z_T(u_z, u_y, nx, nz) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: u_z + complex(dp), device, intent(in), dimension(:, :, :) :: u_y + integer, value, intent(in) :: nx, nz + + complex(dp), shared :: tile(SZ, SZ) + + integer :: i, j, k, b_i, b_j, b_k, b_x, b_y, b_z + + i = threadIdx%x + j = threadIdx%y + k = threadIdx%z + + b_x = blockIdx%z + b_y = blockIdx%y + b_z = blockIdx%x + + ! copy into shared + if ( j + (b_z - 1)*SZ <= nz ) & + tile(i, j) = u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, & + j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) + + call syncthreads() + + ! copy into output array from shared + if ( i + (b_z - 1)*SZ <= nz ) & + u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx) = tile(j, i) + + end subroutine reorder_cmplx_y2z_T + + attributes(global) subroutine reorder_cmplx_z2y_T(u_y, u_z, nx, nz) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: u_y + complex(dp), device, intent(in), dimension(:, :, :) :: u_z + integer, value, intent(in) :: nx, nz + + complex(dp), shared :: tile(SZ, SZ) + + integer :: i, j, k, b_x, b_y, b_z + + i = threadIdx%x + j = threadIdx%y + k = threadIdx%z + + b_x = blockIdx%z + b_y = blockIdx%y + b_z = blockIdx%x + + ! copy into shared + if ( i + (b_z - 1)*SZ <= nz ) & + tile(i, j) = u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx) + + call syncthreads() + + ! copy into output array from shared + if ( j + (b_z - 1)*SZ <= nz ) & + u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, & + j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) = tile(j, i) + + end subroutine reorder_cmplx_z2y_T + + attributes(global) subroutine reshapeDSF(uout, uin) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: uout + real(dp), device, intent(in), dimension(:, :, :) :: uin + + real(dp), shared :: tile(SZ + 1, SZ) + + integer :: i, j, b_i, b + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b = blockIdx%y + + tile(i, j) = uin(i, j + (b_i - 1)*SZ, b) + + call syncthreads() + + uout(i + (b_i - 1)*SZ, j, b) = tile(j, i) + + end subroutine reshapeDSF + + attributes(global) subroutine reshapeDSB(uout, uin) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: uout + real(dp), device, intent(in), dimension(:, :, :) :: uin + + real(dp), shared :: tile(SZ + 1, SZ) + + integer :: i, j, b_i, b + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b = blockIdx%y + + tile(i, j) = uin(i + (b_i - 1)*SZ, j, b) + + call syncthreads() + + uout(i, j + (b_i - 1)*SZ, b) = tile(j, i) + + end subroutine reshapeDSB + + attributes(global) subroutine reshapeCDSF(uout, uin) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: uout + complex(dp), device, intent(in), dimension(:, :, :) :: uin + + complex(dp), shared :: tile(SZ + 1, SZ) + + integer :: i, j, b_i, b + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b = blockIdx%y + + tile(i, j) = uin(i, j + (b_i - 1)*SZ, b) + + call syncthreads() + + uout(i + (b_i - 1)*SZ, j, b) = tile(j, i) + + end subroutine reshapeCDSF + + attributes(global) subroutine reshapeCDSB(uout, uin) + implicit none + + complex(dp), device, intent(out), dimension(:, :, :) :: uout + complex(dp), device, intent(in), dimension(:, :, :) :: uin + + complex(dp), shared :: tile(SZ + 1, SZ) + + integer :: i, j, b_i, b + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b = blockIdx%y + + tile(i, j) = uin(i + (b_i - 1)*SZ, j, b) + + call syncthreads() + + uout(i, j + (b_i - 1)*SZ, b) = tile(j, i) + + end subroutine reshapeCDSB + +end module m_cuda_complex diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 new file mode 100644 index 00000000..3df787a2 --- /dev/null +++ b/src/cuda/poisson_fft.f90 @@ -0,0 +1,244 @@ +module m_cuda_poisson_fft + use cudafor + use cufft + + use m_allocator, only: field_t + use m_common, only: dp + use m_poisson_fft, only: poisson_fft_t + use m_tdsops, only: dirps_t + + use m_cuda_allocator, only: cuda_field_t + use m_cuda_common, only: SZ + use m_cuda_complex, only: reorder_cmplx_x2y_T, reorder_cmplx_y2x_T, & + reorder_cmplx_y2z_T, reorder_cmplx_z2y_T, & + process_spectral_div_u + + implicit none + + type, extends(poisson_fft_t) :: cuda_poisson_fft_t + !! FFT based Poisson solver + !! It can only handle 1D decompositions along z direction. + + !> Local domain sized arrays to store data in spectral space + complex(dp), device, pointer, dimension(:) :: c_x_dev, c_y_dev, c_z_dev + !> Local domain sized array storing the spectral equivalence constants + complex(dp), device, allocatable, dimension(:, :, :) :: waves_dev + + real(dp), device, allocatable, dimension(:) :: ax_dev, bx_dev, & + ay_dev, by_dev, az_dev, bz_dev + + real(dp), device, allocatable, dimension(:, :, :) :: f_tmp + + integer :: planD2Zz, planZ2Dz, planZ2Zx, planZ2Zy + contains + procedure :: fft_forward => fft_forward_cuda + procedure :: fft_backward => fft_backward_cuda + procedure :: fft_postprocess => fft_postprocess_cuda + end type cuda_poisson_fft_t + + interface cuda_poisson_fft_t + module procedure init + end interface cuda_poisson_fft_t + + private :: init + +contains + + function init(xdirps, ydirps, zdirps) result(poisson_fft) + implicit none + + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + type(cuda_poisson_fft_t) :: poisson_fft + + integer :: nx, ny, nz + + integer :: ierrfft + integer(int_ptr_kind()) :: worksize + + call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ) + + nx = poisson_fft%nx; ny = poisson_fft%ny; nz = poisson_fft%nz + + allocate (poisson_fft%waves_dev(SZ, nx, (ny*(nz/2 + 1))/SZ)) + poisson_fft%waves_dev = poisson_fft%waves + + allocate (poisson_fft%ax_dev(nx), poisson_fft%bx_dev(nx)) + allocate (poisson_fft%ay_dev(ny), poisson_fft%by_dev(ny)) + allocate (poisson_fft%az_dev(nz), poisson_fft%bz_dev(nz)) + poisson_fft%ax_dev = poisson_fft%ax; poisson_fft%bx_dev = poisson_fft%bx + poisson_fft%ay_dev = poisson_fft%ay; poisson_fft%by_dev = poisson_fft%by + poisson_fft%az_dev = poisson_fft%az; poisson_fft%bz_dev = poisson_fft%bz + + allocate (poisson_fft%c_x_dev(nx*ny*(nz/2 + 1))) + allocate (poisson_fft%c_y_dev(nx*ny*(nz/2 + 1))) + allocate (poisson_fft%c_z_dev(nx*ny*(nz/2 + 1))) + + ! We can't currently ask allocator to pass us an array with + ! exact shape we want, so we allocate an extra array here. + ! This will be removed when allocator is fixed. + allocate (poisson_fft%f_tmp(nz, SZ, nx*ny/SZ)) + + ! set cufft plans + ierrfft = cufftCreate(poisson_fft%planD2Zz) + ierrfft = cufftMakePlanMany(poisson_fft%planD2Zz, 1, nz, & + nz, 1, nz, nz/2+1, 1, nz/2+1, & + CUFFT_D2Z, nx*ny, worksize) + ierrfft = cufftSetWorkArea(poisson_fft%planD2Zz, poisson_fft%c_x_dev) + + ierrfft = cufftCreate(poisson_fft%planZ2Dz) + ierrfft = cufftMakePlanMany(poisson_fft%planZ2Dz, 1, nz, & + nz/2+1, 1, nz/2+1, nz, 1, nz, & + CUFFT_Z2D, nx*ny, worksize) + ierrfft = cufftSetWorkArea(poisson_fft%planZ2Dz, poisson_fft%c_x_dev) + + ierrfft = cufftCreate(poisson_fft%planZ2Zy) + ierrfft = cufftMakePlanMany(poisson_fft%planZ2Zy, 1, ny, & + ny, 1, ny, ny, 1, ny, & + CUFFT_Z2Z, nx*(nz/2 + 1), worksize) + ierrfft = cufftSetWorkArea(poisson_fft%planZ2Zy, poisson_fft%c_x_dev) + + ierrfft = cufftCreate(poisson_fft%planZ2Zx) + ierrfft = cufftMakePlanMany(poisson_fft%planZ2Zx, 1, nx, & + nx, 1, nx, nx, 1, nx, & + CUFFT_Z2Z, ny*(nz/2 + 1), worksize) + ierrfft = cufftSetWorkArea(poisson_fft%planZ2Zx, poisson_fft%c_y_dev) + + end function init + + subroutine fft_forward_cuda(self, f) + implicit none + + class(cuda_poisson_fft_t) :: self + class(field_t), intent(in) :: f + + real(dp), device, pointer, dimension(:, :, :) :: f_dev + complex(dp), device, dimension(:, :, :), pointer :: c_x_ptr, c_y_ptr, & + c_z_ptr + + type(dim3) :: blocks, threads + integer :: ierrfft + + select type(f); type is (cuda_field_t); f_dev => f%data_d; end select + + ! First reorder f into cartesian-like data structure + blocks = dim3(self%nz/SZ, self%nx*self%ny/SZ, 1) + threads = dim3(SZ, SZ, 1) + call reshapeDSF<<>>(self%f_tmp, f_dev) + + ! Forward FFT transform in z from real to complex + ierrfft = cufftExecD2Z(self%planD2Zz, self%f_tmp, self%c_z_dev) + + ! Reorder from z to y + blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx) + threads = dim3(SZ, SZ, 1) + c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + c_z_ptr(1:self%nz/2 + 1, 1:SZ, 1:self%nx*self%ny/SZ) => self%c_z_dev + + call reorder_cmplx_z2y_T<<>>(c_y_ptr, c_z_ptr, & + self%nx, self%nz/2 + 1) + + ! In-place forward FFT in y + ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, & + CUFFT_FORWARD) + + ! Reorder from y to x + blocks = dim3(self%nx/SZ, self%ny/SZ, self%nz/2 + 1) + threads = dim3(SZ, SZ, 1) + c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev + c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + + call reorder_cmplx_y2x_T<<>>(c_x_ptr, c_y_ptr, & + self%nz/2 + 1) + + ! In-place forward FFT in x + ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, & + CUFFT_FORWARD) + + end subroutine fft_forward_cuda + + subroutine fft_backward_cuda(self, f) + implicit none + + class(cuda_poisson_fft_t) :: self + class(field_t), intent(inout) :: f + + real(dp), device, pointer, dimension(:, :, :) :: f_dev + complex(dp), device, dimension(:, :, :), pointer :: c_x_ptr, c_y_ptr, & + c_z_ptr + + type(dim3) :: blocks, threads + integer :: ierrfft + + select type(f); type is (cuda_field_t); f_dev => f%data_d; end select + + ! In-place backward FFT in x + ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, & + CUFFT_INVERSE) + + ! Reorder from x to y + blocks = dim3(self%nx/SZ, self%ny/SZ, self%nz/2 + 1) + threads = dim3(SZ, SZ, 1) + c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev + c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + + call reorder_cmplx_x2y_T<<>>(c_y_ptr, c_x_ptr, & + self%nz/2 + 1) + + ! In-place backward FFT in y + ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, & + CUFFT_INVERSE) + + ! Reorder from y to z + blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx) + threads = dim3(SZ, SZ, 1) + c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev + c_z_ptr(1:self%nz/2 + 1, 1:SZ, 1:self%nx*self%ny/SZ) => self%c_z_dev + + call reorder_cmplx_y2z_T<<>>(c_z_ptr, c_y_ptr, & + self%nx, self%nz/2 + 1) + + ! Backward FFT transform in z from complex to real + ierrfft = cufftExecZ2D(self%planZ2Dz, self%c_z_dev, self%f_tmp) + + ! Finally reorder f back into our specialist data structure + blocks = dim3(self%nz/SZ, self%nx*self%ny/SZ, 1) + threads = dim3(SZ, SZ, 1) + call reshapeDSB<<>>(f_dev, self%f_tmp) + + end subroutine fft_backward_cuda + + subroutine fft_postprocess_cuda(self) + implicit none + + class(cuda_poisson_fft_t) :: self + + complex(dp), device, dimension(:, :, :), pointer :: c_dev, c_x_ptr + type(dim3) :: blocks, threads + + ! Get some complex array pointers with right shape + c_x_ptr(1:self%nx, 1:SZ, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_x_dev + c_dev(1:SZ, 1:self%nx, 1:(self%ny*(self%nz/2 + 1))/SZ) => self%c_y_dev + + ! Reshape from cartesian-like to our specialist data structure + blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1) + threads = dim3(SZ, SZ, 1) + call reshapeCDSB<<>>(c_dev, c_x_ptr) + + ! Postprocess + blocks = dim3((self%ny*(self%nz/2 + 1))/SZ, 1, 1) + threads = dim3(SZ, 1, 1) + call process_spectral_div_u<<>>( & + c_dev, self%waves_dev, self%nx, self%ny, self%nz, & + self%ax_dev, self%bx_dev, self%ay_dev, self%by_dev, & + self%az_dev, self%bz_dev & + ) + + ! Reshape from our specialist data structure to cartesian-like + blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1) + threads = dim3(SZ, SZ, 1) + call reshapeCDSF<<>>(c_x_ptr, c_dev) + + end subroutine fft_postprocess_cuda + +end module m_cuda_poisson_fft diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index aca287bd..aff888ce 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -7,6 +7,7 @@ module m_omp_backend use m_omp_sendrecv, only: sendrecv_fields use m_omp_common, only: SZ + use m_omp_poisson_fft, only: omp_poisson_fft_t implicit none @@ -33,6 +34,7 @@ module m_omp_backend procedure :: scalar_product => scalar_product_omp procedure :: set_field => set_field_omp procedure :: get_field => get_field_omp + procedure :: init_poisson_fft => init_omp_poisson_fft procedure :: transeq_omp_dist end type omp_backend_t @@ -362,5 +364,20 @@ subroutine get_field_omp(self, arr, f) end subroutine get_field_omp + subroutine init_omp_poisson_fft(self, xdirps, ydirps, zdirps) + implicit none + + class(omp_backend_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + allocate(omp_poisson_fft_t :: self%poisson_fft) + + select type (poisson_fft => self%poisson_fft) + type is (omp_poisson_fft_t) + poisson_fft = omp_poisson_fft_t(xdirps, ydirps, zdirps) + end select + + end subroutine init_omp_poisson_fft + end module m_omp_backend diff --git a/src/omp/poisson_fft.f90 b/src/omp/poisson_fft.f90 new file mode 100644 index 00000000..a796f1fc --- /dev/null +++ b/src/omp/poisson_fft.f90 @@ -0,0 +1,60 @@ +module m_omp_poisson_fft + use m_allocator, only: field_t + use m_common, only: dp + use m_poisson_fft, only: poisson_fft_t + use m_tdsops, only: dirps_t + + use m_omp_common, only: SZ + + implicit none + + type, extends(poisson_fft_t) :: omp_poisson_fft_t + !! FFT based Poisson solver + !! It can only handle 1D decompositions along z direction. + complex(dp), allocatable, dimension(:, :, :) :: c_x, c_y, c_z + contains + procedure :: fft_forward => fft_forward_omp + procedure :: fft_backward => fft_backward_omp + procedure :: fft_postprocess => fft_postprocess_omp + end type omp_poisson_fft_t + + interface omp_poisson_fft_t + module procedure init + end interface omp_poisson_fft_t + + private :: init + +contains + + function init(xdirps, ydirps, zdirps) result(poisson_fft) + implicit none + + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps + + type(omp_poisson_fft_t) :: poisson_fft + + call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ) + + end function init + + subroutine fft_forward_omp(self, f_in) + implicit none + + class(omp_poisson_fft_t) :: self + class(field_t), intent(in) :: f_in + end subroutine fft_forward_omp + + subroutine fft_backward_omp(self, f_out) + implicit none + + class(omp_poisson_fft_t) :: self + class(field_t), intent(inout) :: f_out + end subroutine fft_backward_omp + + subroutine fft_postprocess_omp(self) + implicit none + + class(omp_poisson_fft_t) :: self + end subroutine fft_postprocess_omp + +end module m_omp_poisson_fft diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90 new file mode 100644 index 00000000..9ee0f575 --- /dev/null +++ b/src/poisson_fft.f90 @@ -0,0 +1,198 @@ +module m_poisson_fft + use m_allocator, only: field_t + use m_common, only: dp, pi + use m_tdsops, only: dirps_t + + implicit none + + type, abstract :: poisson_fft_t + !! FFT based Poisson solver + !! It can only handle 1D decompositions along z direction. + integer :: nx, ny, nz + complex(dp), allocatable, dimension(:, :, :) :: waves + complex(dp), allocatable, dimension(:) :: ax, bx, ay, by, az, bz + contains + procedure(fft_forward), deferred :: fft_forward + procedure(fft_backward), deferred :: fft_backward + procedure(fft_postprocess), deferred :: fft_postprocess + procedure :: base_init + procedure :: waves_set + end type poisson_fft_t + + abstract interface + subroutine fft_forward(self, f_in) + import :: poisson_fft_t + import :: field_t + implicit none + + class(poisson_fft_t) :: self + class(field_t), intent(in) :: f_in + end subroutine fft_forward + + subroutine fft_backward(self, f_out) + import :: poisson_fft_t + import :: field_t + implicit none + + class(poisson_fft_t) :: self + class(field_t), intent(inout) :: f_out + end subroutine fft_backward + + subroutine fft_postprocess(self) + import :: poisson_fft_t + implicit none + + class(poisson_fft_t) :: self + end subroutine fft_postprocess + end interface + +contains + + subroutine base_init(self, xdirps, ydirps, zdirps, sz) + implicit none + + class(poisson_fft_t) :: self + class(dirps_t), intent(in) :: xdirps, ydirps, zdirps + integer, intent(in) :: sz + + self%nx = xdirps%n; self%ny = ydirps%n; self%nz = zdirps%n + + allocate (self%ax(self%nx), self%bx(self%nx)) + allocate (self%ay(self%nx), self%by(self%nx)) + allocate (self%az(self%nx), self%bz(self%nx)) + + allocate (self%waves(sz, self%nx, (self%ny*(self%nz/2 + 1))/sz)) + + ! waves_set requires some of the preprocessed tdsops variables. + call self%waves_set(xdirps, ydirps, zdirps, sz) + + end subroutine base_init + + subroutine waves_set(self, xdirps, ydirps, zdirps, sz) + !! Ref. JCP 228 (2009), 5989–6015, Sec 4 + implicit none + + class(poisson_fft_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + integer, intent(in) :: sz + + complex(dp), allocatable, dimension(:) :: xkx, xk2, yky, yk2, zkz, zk2, & + exs, eys, ezs + + integer :: nx, ny, nz + real(dp) :: w, wp, rlexs, rleys, rlezs, xtt, ytt, ztt, xt1, yt1, zt1 + complex(dp) :: xt2, yt2, zt2, xyzk + + integer :: i, j, ka, kb, ix, iy, iz + + nx = xdirps%n; ny = ydirps%n; nz = zdirps%n + + do i = 1, nx + self%ax(i) = sin((i-1)*pi/nx) + self%bx(i) = cos((i-1)*pi/nx) + end do + + do i = 1, ny + self%ay(i) = sin((i-1)*pi/ny) + self%by(i) = cos((i-1)*pi/ny) + end do + + do i = 1, nz + self%az(i) = sin((i-1)*pi/nz) + self%bz(i) = cos((i-1)*pi/nz) + end do + + ! Now kxyz + allocate(xkx(nx), xk2(nx), exs(nx)) + allocate(yky(ny), yk2(ny), eys(ny)) + allocate(zkz(nz), zk2(nz), ezs(nz)) + xkx(:) = 0; xk2(:) = 0; yky(:) = 0; yk2(:) = 0; zkz(:) = 0; zk2(:) = 0 + + ! periodic-x + do i = 1, nx/2 + 1 + w = 2*pi*(i - 1)/nx + wp = xdirps%stagder_v2p%a*2*xdirps%d*sin(0.5_dp*w) & + + xdirps%stagder_v2p%b*2*xdirps%d*sin(1.5_dp*w) + wp = wp/(1._dp + 2*xdirps%stagder_v2p%alpha*cos(w)) + + xkx(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*wp/xdirps%L) + exs(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*w/xdirps%L) + xk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*wp/xdirps%L)**2 + end do + do i = nx/2 + 2, nx + xkx(i) = xkx(nx - i + 2) + exs(i) = exs(nx - i + 2) + xk2(i) = xk2(nx - i + 2) + end do + + ! periodic-y + do i = 1, ny/2 + 1 + w = 2*pi*(i - 1)/ny + wp = ydirps%stagder_v2p%a*2*ydirps%d*sin(0.5_dp*w) & + + ydirps%stagder_v2p%b*2*ydirps%d*sin(1.5_dp*w) + wp = wp/(1._dp + 2*ydirps%stagder_v2p%alpha*cos(w)) + + yky(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*wp/ydirps%L) + eys(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*w/ydirps%L) + yk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*wp/ydirps%L)**2 + end do + do i = ny/2 + 2, ny + yky(i) = yky(ny-i+2) + eys(i) = eys(ny-i+2) + yk2(i) = yk2(ny-i+2) + end do + + ! periodic-z + do i = 1, nz/2 + 1 + w = 2*pi*(i - 1)/nz + wp = zdirps%stagder_v2p%a*2*zdirps%d*sin(0.5_dp*w) & + + zdirps%stagder_v2p%b*2*zdirps%d*sin(1.5_dp*w) + wp = wp/(1._dp + 2*zdirps%stagder_v2p%alpha*cos(w)) + + zkz(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L) + ezs(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*w/zdirps%L) + zk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L)**2 + end do + + print*, 'waves array is correctly set only for a single rank run' + ! TODO: do loop ranges below are valid only for single rank runs + do ka = 1, nz/2 + 1 + do kb = 1, ny/sz + do j = 1, nx + do i = 1, sz + ix = j; iy = (kb - 1)*sz + i; iz = ka + rlexs = real(exs(ix), kind=dp)*xdirps%d + rleys = real(eys(iy), kind=dp)*ydirps%d + rlezs = real(ezs(iz), kind=dp)*zdirps%d + + xtt = 2*(xdirps%interpl_v2p%a*cos(rlexs*0.5_dp) & + + xdirps%interpl_v2p%b*cos(rlexs*1.5_dp) & + + xdirps%interpl_v2p%c*cos(rlexs*2.5_dp) & + + xdirps%interpl_v2p%d*cos(rlexs*3.5_dp)) + ytt = 2*(ydirps%interpl_v2p%a*cos(rleys*0.5_dp) & + + ydirps%interpl_v2p%b*cos(rleys*1.5_dp) & + + ydirps%interpl_v2p%c*cos(rleys*2.5_dp) & + + ydirps%interpl_v2p%d*cos(rleys*3.5_dp)) + ztt = 2*(zdirps%interpl_v2p%a*cos(rlezs*0.5_dp) & + + zdirps%interpl_v2p%b*cos(rlezs*1.5_dp) & + + zdirps%interpl_v2p%c*cos(rlezs*2.5_dp) & + + zdirps%interpl_v2p%d*cos(rlezs*3.5_dp)) + + xt1 = 1._dp + 2*xdirps%interpl_v2p%alpha*cos(rlexs) + yt1 = 1._dp + 2*ydirps%interpl_v2p%alpha*cos(rleys) + zt1 = 1._dp + 2*zdirps%interpl_v2p%alpha*cos(rlezs) + + xt2 = xk2(ix)*(((ytt/yt1)*(ztt/zt1))**2) + yt2 = yk2(iy)*(((xtt/xt1)*(ztt/zt1))**2) + zt2 = zk2(iz)*(((xtt/xt1)*(ytt/yt1))**2) + + xyzk = xt2 + yt2 + zt2 + self%waves(i, j, ka + (kb - 1)*(nz/2 + 1)) = xyzk + end do + end do + end do + end do + + end subroutine waves_set + +end module m_poisson_fft diff --git a/src/solver.f90 b/src/solver.f90 index fad0a715..c9a04a11 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -2,7 +2,8 @@ module m_solver use m_allocator, only: allocator_t, field_t use m_base_backend, only: base_backend_t use m_common, only: dp, globs_t, & - RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y + RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, & + POISSON_SOLVER_FFT, POISSON_SOLVER_CG use m_tdsops, only: tdsops_t, dirps_t use m_time_integrator, only: time_intg_t @@ -37,23 +38,39 @@ module m_solver !! for later use. real(dp) :: dt, nu + integer :: n_iters, n_output class(field_t), pointer :: u, v, w class(base_backend_t), pointer :: backend class(dirps_t), pointer :: xdirps, ydirps, zdirps class(time_intg_t), pointer :: time_integrator + procedure(poisson_solver), pointer :: poisson => null() contains procedure :: transeq procedure :: divergence_v2p procedure :: gradient_p2v procedure :: curl + procedure :: output procedure :: run end type solver_t + abstract interface + subroutine poisson_solver(self, pressure, div_u) + import :: solver_t + import :: field_t + implicit none + + class(solver_t) :: self + class(field_t), intent(inout) :: pressure + class(field_t), intent(in) :: div_u + end subroutine poisson_solver + end interface + interface solver_t module procedure init end interface solver_t + contains function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & @@ -69,8 +86,8 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & real(dp), allocatable, dimension(:, :, :) :: u_init, v_init, w_init integer :: dims(3) - real(dp) :: dx, dy, dz - integer :: nx, ny, nz + real(dp) :: x, y, z + integer :: nx, ny, nz, i, j, b, ka, kb, ix, iy, iz, sz solver%backend => backend solver%time_integrator => time_integrator @@ -89,9 +106,31 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & allocate(v_init(dims(1), dims(2), dims(3))) allocate(w_init(dims(1), dims(2), dims(3))) - u_init = 0 - v_init = 0 - w_init = 0 + solver%dt = globs%dt + solver%backend%nu = globs%nu + solver%n_iters = globs%n_iters + solver%n_output = globs%n_output + + sz = dims(1) + nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc + do ka = 1, nz + do kb = 1, ny/sz + do j = 1, nx + do i = 1, sz + ! Mapping to ix, iy, iz depends on global group numbering + ix = j; iy = (kb - 1)*sz + i; iz = ka + x = (ix - 1)*globs%dx + y = (iy - 1)*globs%dy + z = (iz - 1)*globs%dz + + b = ka + (kb - 1)*xdirps%n + u_init(i, j, b) = sin(x)*cos(y)*cos(z) + v_init(i, j, b) =-cos(x)*sin(y)*cos(z) + w_init(i, j, b) = 0 + end do + end do + end do + end do call solver%backend%set_field(solver%u, u_init) call solver%backend%set_field(solver%v, v_init) @@ -100,13 +139,20 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & deallocate(u_init, v_init, w_init) print*, 'initial conditions are set' - nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc - dx = globs%dx; dy = globs%dy; dz = globs%dz - ! Allocate and set the tdsops - call allocate_tdsops(solver%xdirps, nx, dx, solver%backend) - call allocate_tdsops(solver%ydirps, ny, dy, solver%backend) - call allocate_tdsops(solver%zdirps, nz, dz, solver%backend) + call allocate_tdsops(solver%xdirps, nx, globs%dx, solver%backend) + call allocate_tdsops(solver%ydirps, ny, globs%dy, solver%backend) + call allocate_tdsops(solver%zdirps, nz, globs%dz, solver%backend) + + select case (globs%poisson_solver_type) + case (POISSON_SOLVER_FFT) + print*, 'Poisson solver: FFT' + call solver%backend%init_poisson_fft(xdirps, ydirps, zdirps) + solver%poisson => poisson_fft + case (POISSON_SOLVER_CG) + print*, 'Poisson solver: CG, not yet implemented' + solver%poisson => poisson_cg + end select end function init @@ -489,20 +535,86 @@ subroutine curl(self, o_i_hat, o_j_hat, o_k_hat, u, v, w) end subroutine curl - subroutine run(self, n_iter, u_out, v_out, w_out) + subroutine poisson_fft(self, pressure, div_u) + implicit none + + class(solver_t) :: self + class(field_t), intent(inout) :: pressure + class(field_t), intent(in) :: div_u + + ! call forward FFT + ! output array in spectral space is stored at poisson_fft class + call self%backend%poisson_fft%fft_forward(div_u) + + ! postprocess + call self%backend%poisson_fft%fft_postprocess + + ! call backward FFT + call self%backend%poisson_fft%fft_backward(pressure) + + end subroutine poisson_fft + + subroutine poisson_cg(self, pressure, div_u) + implicit none + + class(solver_t) :: self + class(field_t), intent(inout) :: pressure + class(field_t), intent(in) :: div_u + + end subroutine poisson_cg + + subroutine output(self, t, u_out) + implicit none + + class(solver_t), intent(in) :: self + real(dp), intent(in) :: t + real(dp), dimension(:, :, :), intent(inout) :: u_out + + class(field_t), pointer :: du, dv, dw + integer :: ngrid + + ngrid = self%xdirps%n*self%ydirps%n*self%zdirps%n + print*, 'time = ', t + + du => self%backend%allocator%get_block() + dv => self%backend%allocator%get_block() + dw => self%backend%allocator%get_block() + + call self%curl(du, dv, dw, self%u, self%v, self%w) + print*, 'enstrophy:', 0.5_dp*( & + self%backend%scalar_product(du, du) & + + self%backend%scalar_product(dv, dv) & + + self%backend%scalar_product(dw, dw) & + )/ngrid + + call self%backend%allocator%release_block(du) + call self%backend%allocator%release_block(dv) + call self%backend%allocator%release_block(dw) + + call self%divergence_v2p(du, self%u, self%v, self%w) + call self%backend%get_field(u_out, du) + print*, 'div u max mean:', maxval(abs(u_out)), sum(abs(u_out))/ngrid + + end subroutine output + + subroutine run(self, u_out, v_out, w_out) implicit none class(solver_t), intent(in) :: self - integer, intent(in) :: n_iter real(dp), dimension(:, :, :), intent(inout) :: u_out, v_out, w_out class(field_t), pointer :: du, dv, dw, div_u, pressure, dpdx, dpdy, dpdz + real(dp) :: t integer :: i + print*, 'initial conditions' + t = 0._dp + call self%output(t, u_out) + print*, 'start run' - do i = 1, n_iter + do i = 1, self%n_iters du => self%backend%allocator%get_block() dv => self%backend%allocator%get_block() dw => self%backend%allocator%get_block() @@ -524,7 +636,7 @@ subroutine run(self, n_iter, u_out, v_out, w_out) pressure => self%backend%allocator%get_block() - !call self%poisson(pressure, div_u) + call self%poisson(pressure, div_u) call self%backend%allocator%release_block(div_u) @@ -544,6 +656,11 @@ subroutine run(self, n_iter, u_out, v_out, w_out) call self%backend%allocator%release_block(dpdx) call self%backend%allocator%release_block(dpdy) call self%backend%allocator%release_block(dpdz) + + if ( mod(i, self%n_output) == 0 ) then + t = i*self%dt + call self%output(t, u_out) + end if end do print*, 'run end' diff --git a/src/tdsops.f90 b/src/tdsops.f90 index c159cdd2..3db7b4d9 100644 --- a/src/tdsops.f90 +++ b/src/tdsops.f90 @@ -27,6 +27,7 @@ module m_tdsops dist_sa, dist_sc, & !! back subs. dist_af !! the auxiliary factors real(dp), allocatable :: coeffs(:), coeffs_s(:, :), coeffs_e(:, :) + real(dp) :: alpha, a, b, c = 0._dp, d = 0._dp integer :: n, n_halo contains procedure :: deriv_1st, deriv_2nd, interpl_mid, stagder_1st, preprocess @@ -46,6 +47,7 @@ module m_tdsops stagder_v2p, stagder_p2v, & interpl_v2p, interpl_p2v integer :: nrank, nproc, pnext, pprev, n, n_blocks + real(dp) :: L, d end type dirps_t contains @@ -154,6 +156,9 @@ subroutine deriv_1st(self, delta, scheme, bc_start, bc_end, sym) error stop 'scheme is not defined' end select + self%alpha = alpha + self%a = afi; self%b = bfi + self%coeffs(:) = [0._dp, 0._dp, -bfi, -afi, & 0._dp, & afi, bfi, 0._dp, 0._dp] @@ -321,6 +326,9 @@ subroutine deriv_2nd(self, delta, scheme, bc_start, bc_end, sym, & error stop 'scheme is not defined' end select + self%alpha = alpha + self%a = asi; self%b = bsi; self%c = csi; self%d = dsi + self%coeffs(:) = [dsi, csi, bsi, asi, & -2._dp*(asi + bsi + csi + dsi), & asi, bsi, csi, dsi] @@ -511,6 +519,9 @@ subroutine interpl_mid(self, scheme, from_to, bc_start, bc_end, sym) error stop 'scheme is not defined' end select + self%alpha = alpha + self%a = aici; self%b = bici; self%c = cici; self%d = dici + select case (from_to) case ('v2p') self%coeffs(:) = [0._dp, dici, cici, bici, & @@ -633,6 +644,9 @@ subroutine stagder_1st(self, delta, scheme, from_to, bc_start, bc_end, sym) error stop 'scheme is not defined' end select + self%alpha = alpha + self%a = aci; self%b = bci + select case (from_to) case ('v2p') self%coeffs(:) = [0._dp, 0._dp, 0._dp, -bci, & diff --git a/src/xcompact.f90 b/src/xcompact.f90 index 779e09f0..635db2a7 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -3,7 +3,8 @@ program xcompact use m_allocator use m_base_backend - use m_common, only: pi, globs_t, set_pprev_pnext + use m_common, only: pi, globs_t, set_pprev_pnext, & + POISSON_SOLVER_FFT, POISSON_SOLVER_CG use m_solver, only: solver_t use m_time_integrator, only: time_intg_t use m_tdsops, only: tdsops_t @@ -55,13 +56,21 @@ program xcompact ! read L_x/y/z from the input file globs%Lx = 2*pi; globs%Ly = 2*pi; globs%Lz = 2*pi + xdirps%L = globs%Lx; ydirps%L = globs%Ly; zdirps%L = globs%Lz ! read ns from the input file - globs%nx = 512; globs%ny = 512; globs%nz = 512 + globs%nx = 256; globs%ny = 256; globs%nz = 256 + + globs%dt = 0.001_dp + globs%nu = 1._dp/1600._dp + globs%n_iters = 20000 + globs%n_output = 100 ! set nprocs based on run time arguments globs%nproc_x = 1; globs%nproc_y = 1; globs%nproc_z = 1 + globs%poisson_solver_type = POISSON_SOLVER_FFT + ! Lets allow a 1D decomposition for the moment !globs%nproc_x = nproc @@ -91,6 +100,8 @@ program xcompact globs%dy = globs%Ly/globs%ny globs%dz = globs%Lz/globs%nz + xdirps%d = globs%dx; ydirps%d = globs%dy; zdirps%d = globs%dz + xdirps%n = globs%nx_loc ydirps%n = globs%ny_loc zdirps%n = globs%nz_loc @@ -117,8 +128,6 @@ program xcompact print*, 'OpenMP backend instantiated' #endif - backend%nu = 1._dp - allocate(u(SZ, globs%nx_loc, globs%n_groups_x)) allocate(v(SZ, globs%nx_loc, globs%n_groups_x)) allocate(w(SZ, globs%nx_loc, globs%n_groups_x)) @@ -130,7 +139,7 @@ program xcompact call cpu_time(t_start) - call solver%run(100, u, v, w) + call solver%run(u, v, w) call cpu_time(t_end)