Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add reorder kernels for switching from/to Cartesian ordering #76

Merged
merged 11 commits into from Mar 21, 2024
105 changes: 87 additions & 18 deletions src/backend.f90
@@ -1,6 +1,6 @@
module m_base_backend
use m_allocator, only: allocator_t, field_t
use m_common, only: dp
use m_common, only: dp, DIR_C, get_rdr_from_dirs
use m_poisson_fft, only: poisson_fft_t
use m_tdsops, only: tdsops_t, dirps_t

Expand Down Expand Up @@ -36,10 +36,12 @@ module m_base_backend
procedure(sum_intox), deferred :: sum_zintox
procedure(vecadd), deferred :: vecadd
procedure(scalar_product), deferred :: scalar_product
procedure(get_field), deferred :: get_field
procedure(set_field), deferred :: set_field
procedure(copy_data_to_f), deferred :: copy_data_to_f
procedure(copy_f_to_data), deferred :: copy_f_to_data
procedure(alloc_tdsops), deferred :: alloc_tdsops
procedure(init_poisson_fft), deferred :: init_poisson_fft
procedure :: get_field_data
procedure :: set_field_data
end type base_backend_t

abstract interface
Expand Down Expand Up @@ -143,32 +145,31 @@ end function scalar_product
end interface

abstract interface
subroutine get_field(self, arr, f)
!! copy the specialist data structure from device or host back
!! to a regular 3D data structure.
subroutine copy_data_to_f(self, f, data)
!! Copy the specialist data structure from device or host back
!! to a regular 3D data array in host memory.
import :: base_backend_t
import :: dp
import :: field_t
implicit none

class(base_backend_t) :: self
real(dp), dimension(:, :, :), intent(out) :: arr
class(field_t), intent(in) :: f
end subroutine get_field
class(base_backend_t), intent(inout) :: self
class(field_t), intent(inout) :: f
real(dp), dimension(:, :, :), intent(in) :: data
end subroutine copy_data_to_f

subroutine set_field(self, f, arr)
!! copy the initial condition stored in a regular 3D data
!! structure into the specialist data structure array on the
!! device or host.
subroutine copy_f_to_data(self, data, f)
!! Copy a regular 3D array in host memory into the specialist
!! data structure field that lives on device or host
import :: base_backend_t
import :: dp
import :: field_t
implicit none

class(base_backend_t) :: self
class(field_t), intent(inout) :: f
real(dp), dimension(:, :, :), intent(in) :: arr
end subroutine set_field
class(base_backend_t), intent(inout) :: self
real(dp), dimension(:, :, :), intent(out) :: data
class(field_t), intent(in) :: f
end subroutine copy_f_to_data
end interface

abstract interface
Expand Down Expand Up @@ -202,4 +203,72 @@ subroutine init_poisson_fft(self, xdirps, ydirps, zdirps)
end subroutine init_poisson_fft
end interface

contains

subroutine get_field_data(self, data, f, dir)
semi-h marked this conversation as resolved.
Show resolved Hide resolved
!! Extract data from field `f` optionally reordering into `dir` orientation.
!! To output in same orientation as `f`, use `call ...%get_field_data(data, f, f%dir)`
implicit none

class(base_backend_t) :: self
real(dp), dimension(:, :, :), intent(out) :: data !! Output array
class(field_t), intent(in) :: f !! Field
integer, optional, intent(in) :: dir !! Desired orientation of output array (defaults to Cartesian)

class(field_t), pointer :: f_temp
integer :: direction, rdr_dir

if (present(dir)) then
direction = dir
else
direction = DIR_C
end if

! Returns 0 if no reorder required
rdr_dir = get_rdr_from_dirs(direction, f%dir)

! Carry out a reorder if we need, and copy from field to data array
if (rdr_dir /= 0) then
f_temp => self%allocator%get_block(direction)
call self%reorder(f_temp, f, rdr_dir)
call self%copy_f_to_data(data, f_temp)
call self%allocator%release_block(f_temp)
else
call self%copy_f_to_data(data, f)
end if

end subroutine get_field_data

subroutine set_field_data(self, f, data, dir)
implicit none

class(base_backend_t) :: self
class(field_t), intent(inout) :: f !! Field
real(dp), dimension(:, :, :), intent(in) :: data !! Input array
integer, optional, intent(in) :: dir !! Orientation of input array (defaults to Cartesian)

class(field_t), pointer :: f_temp
integer :: direction, rdr_dir

if (present(dir)) then
direction = dir
else
direction = DIR_C
end if

! Returns 0 if no reorder required
rdr_dir = get_rdr_from_dirs(f%dir, direction)

! Carry out a reorder if we need, and copy from data array to field
if (rdr_dir /= 0) then
f_temp => self%allocator%get_block(direction)
call self%copy_data_to_f(f_temp, data)
call self%reorder(f, f_temp, rdr_dir)
call self%allocator%release_block(f_temp)
else
call self%copy_data_to_f(f, data)
end if

end subroutine set_field_data

end module m_base_backend
17 changes: 16 additions & 1 deletion src/common.f90
Expand Up @@ -5,10 +5,18 @@ module m_common
real(dp), parameter :: pi = 4*atan(1.0_dp)

integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, &
RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32
RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32, &
RDR_C2X = 41, RDR_C2Y = 42, RDR_C2Z = 43, &
RDR_X2C = 14, RDR_Y2C = 24, RDR_Z2C = 34
integer, parameter :: DIR_X = 1, DIR_Y = 2, DIR_Z = 3, DIR_C = 4
integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1

integer, protected :: &
rdr_map(4, 4) = reshape([0, RDR_X2Y, RDR_X2Z, RDR_X2C, &
RDR_Y2X, 0, RDR_Y2Z, RDR_Y2C, &
RDR_Z2X, RDR_Z2Y, 0, RDR_Z2C, &
RDR_C2X, RDR_C2Y, RDR_C2Z, 0], shape=[4, 4])

type :: globs_t
integer :: nx, ny, nz
integer :: nx_loc, ny_loc, nz_loc
Expand Down Expand Up @@ -79,4 +87,11 @@ subroutine set_pprev_pnext(xprev, xnext, yprev, ynext, zprev, znext, &

end subroutine set_pprev_pnext

integer function get_rdr_from_dirs(dir_from, dir_to) result(rdr_dir)
!! Returns RDR_?2? value based on two direction inputs
integer, intent(in) :: dir_from, dir_to

rdr_dir = rdr_map(dir_from, dir_to)
end function get_rdr_from_dirs

end module m_common
114 changes: 87 additions & 27 deletions src/cuda/backend.f90
Expand Up @@ -6,7 +6,9 @@ module m_cuda_backend
use m_allocator, only: allocator_t, field_t
use m_base_backend, only: base_backend_t
use m_common, only: dp, globs_t, &
RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y
RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, &
RDR_C2X, RDR_C2Y, RDR_C2Z, RDR_X2C, RDR_Y2C, RDR_Z2C, &
DIR_X, DIR_Y, DIR_Z, DIR_C
use m_poisson_fft, only: poisson_fft_t
use m_tdsops, only: dirps_t, tdsops_t

Expand All @@ -19,7 +21,8 @@ module m_cuda_backend
use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs
use m_cuda_kernels_reorder, only: &
reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2x, &
reorder_z2y, sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy
reorder_z2y, reorder_c2x, reorder_x2c, &
sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy

implicit none

Expand Down Expand Up @@ -47,8 +50,8 @@ module m_cuda_backend
procedure :: sum_zintox => sum_zintox_cuda
procedure :: vecadd => vecadd_cuda
procedure :: scalar_product => scalar_product_cuda
procedure :: set_field => set_field_cuda
procedure :: get_field => get_field_cuda
procedure :: copy_data_to_f => copy_data_to_f_cuda
procedure :: copy_f_to_data => copy_f_to_data_cuda
procedure :: init_poisson_fft => init_cuda_poisson_fft
procedure :: transeq_cuda_dist
procedure :: transeq_cuda_thom
Expand Down Expand Up @@ -417,39 +420,102 @@ subroutine reorder_cuda(self, u_o, u_i, direction)
class(field_t), intent(in) :: u_i
integer, intent(in) :: direction

real(dp), device, pointer, dimension(:, :, :) :: u_o_d, u_i_d
real(dp), device, pointer, dimension(:, :, :) :: u_o_d, u_i_d, u_temp_d
class(field_t), pointer :: u_temp
type(dim3) :: blocks, threads

call resolve_field_t(u_o_d, u_o)
call resolve_field_t(u_i_d, u_i)

select case (direction)
case (RDR_X2Y) ! x2y
case (RDR_X2Y)
blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ)
threads = dim3(SZ, SZ, 1)
call reorder_x2y<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
case (RDR_X2Z) ! x2z
case (RDR_X2Z)
blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
threads = dim3(SZ, 1, 1)
call reorder_x2z<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
case (RDR_Y2X) ! y2x
case (RDR_Y2X)
blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
threads = dim3(SZ, SZ, 1)
call reorder_y2x<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
case (RDR_Y2Z) ! y2z
case (RDR_Y2Z)
blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
threads = dim3(SZ, SZ, 1)
call reorder_y2z<<<blocks, threads>>>(u_o_d, u_i_d, &
self%nx_loc, self%nz_loc)
case (RDR_Z2X) ! z2x
case (RDR_Z2X)
blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
threads = dim3(SZ, 1, 1)
call reorder_z2x<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
case (RDR_Z2Y) ! z2y
case (RDR_Z2Y)
blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
threads = dim3(SZ, SZ, 1)
call reorder_z2y<<<blocks, threads>>>(u_o_d, u_i_d, &
self%nx_loc, self%nz_loc)
case (RDR_C2X)
blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
threads = dim3(SZ, SZ, 1)
call reorder_c2x<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
case (RDR_C2Y)
! First reorder from C to X, then from X to Y
u_temp => self%allocator%get_block(DIR_X)
call resolve_field_t(u_temp_d, u_temp)

blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
threads = dim3(SZ, SZ, 1)
call reorder_c2x<<<blocks, threads>>>(u_temp_d, u_i_d, self%nz_loc)

blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ)
threads = dim3(SZ, SZ, 1)
call reorder_x2y<<<blocks, threads>>>(u_o_d, u_temp_d, self%nz_loc)

call self%allocator%release_block(u_temp)
case (RDR_C2Z)
! First reorder from C to X, then from X to Z
u_temp => self%allocator%get_block(DIR_X)
call resolve_field_t(u_temp_d, u_temp)

blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
threads = dim3(SZ, SZ, 1)
call reorder_c2x<<<blocks, threads>>>(u_temp_d, u_i_d, self%nz_loc)

blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
threads = dim3(SZ, 1, 1)
call reorder_x2z<<<blocks, threads>>>(u_o_d, u_temp_d, self%nz_loc)

call self%allocator%release_block(u_temp)
case (RDR_X2C)
blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
threads = dim3(SZ, SZ, 1)
call reorder_x2c<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
case (RDR_Y2C)
! First reorder from Y to X, then from X to C
u_temp => self%allocator%get_block(DIR_X)
call resolve_field_t(u_temp_d, u_temp)

blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
threads = dim3(SZ, SZ, 1)
call reorder_y2x<<<blocks, threads>>>(u_temp_d, u_i_d, self%nz_loc)

call reorder_x2c<<<blocks, threads>>>(u_o_d, u_temp_d, self%nz_loc)

call self%allocator%release_block(u_temp)
case (RDR_Z2C)
! First reorder from Z to X, then from X to C
u_temp => self%allocator%get_block(DIR_X)
call resolve_field_t(u_temp_d, u_temp)

blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
threads = dim3(SZ, 1, 1)
call reorder_z2x<<<blocks, threads>>>(u_temp_d, u_i_d, self%nz_loc)

blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
threads = dim3(SZ, SZ, 1)
call reorder_x2c<<<blocks, threads>>>(u_o_d, u_temp_d, self%nz_loc)

call self%allocator%release_block(u_temp)
case default
error stop 'Reorder direction is undefined.'
end select
Expand Down Expand Up @@ -564,27 +630,21 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n)

end subroutine copy_into_buffers

subroutine set_field_cuda(self, f, arr)
implicit none

class(cuda_backend_t) :: self
subroutine copy_data_to_f_cuda(self, f, data)
class(cuda_backend_t), intent(inout) :: self
class(field_t), intent(inout) :: f
real(dp), dimension(:, :, :), intent(in) :: arr

select type(f); type is (cuda_field_t); f%data_d = arr; end select
real(dp), dimension(:, :, :), intent(inout) :: data

end subroutine set_field_cuda
select type(f); type is (cuda_field_t); f%data_d = data; end select
end subroutine copy_data_to_f_cuda

subroutine get_field_cuda(self, arr, f)
implicit none

class(cuda_backend_t) :: self
real(dp), dimension(:, :, :), intent(out) :: arr
subroutine copy_f_to_data_cuda(self, data, f)
class(cuda_backend_t), intent(inout) :: self
real(dp), dimension(:, :, :), intent(out) :: data
class(field_t), intent(in) :: f

select type(f); type is (cuda_field_t); arr = f%data_d; end select

end subroutine get_field_cuda
select type(f); type is (cuda_field_t); data = f%data_d; end select
end subroutine copy_f_to_data_cuda

subroutine init_cuda_poisson_fft(self, xdirps, ydirps, zdirps)
implicit none
Expand Down