Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for scalar product operator #67

Merged
merged 6 commits into from Mar 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
36 changes: 25 additions & 11 deletions src/backend.f90
Expand Up @@ -33,8 +33,9 @@ module m_base_backend
procedure(sum_intox), deferred :: sum_yintox
procedure(sum_intox), deferred :: sum_zintox
procedure(vecadd), deferred :: vecadd
procedure(get_fields), deferred :: get_fields
procedure(set_fields), deferred :: set_fields
procedure(scalar_product), deferred :: scalar_product
procedure(get_field), deferred :: get_field
procedure(set_field), deferred :: set_field
procedure(alloc_tdsops), deferred :: alloc_tdsops
end type base_backend_t

Expand Down Expand Up @@ -126,7 +127,20 @@ end subroutine vecadd
end interface

abstract interface
subroutine get_fields(self, u_out, v_out, w_out, u, v, w)
real(dp) function scalar_product(self, x, y) result(s)
!! Calculates the scalar product of two input fields
import :: base_backend_t
import :: dp
import :: field_t
implicit none

class(base_backend_t) :: self
class(field_t), intent(in) :: x, y
end function scalar_product
end interface

abstract interface
subroutine get_field(self, arr, f)
!! copy the specialist data structure from device or host back
!! to a regular 3D data structure.
import :: base_backend_t
Expand All @@ -135,23 +149,23 @@ subroutine get_fields(self, u_out, v_out, w_out, u, v, w)
implicit none

class(base_backend_t) :: self
real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out
class(field_t), intent(in) :: u, v, w
end subroutine get_fields
real(dp), dimension(:, :, :), intent(out) :: arr
class(field_t), intent(in) :: f
end subroutine get_field

subroutine set_fields(self, u, v, w, u_in, v_in, w_in)
subroutine set_field(self, f, arr)
!! copy the initial condition stored in a regular 3D data
!! structure into the specialist data structure arrays on the
!! structure into the specialist data structure array on the
!! device or host.
import :: base_backend_t
import :: dp
import :: field_t
implicit none

class(base_backend_t) :: self
class(field_t), intent(inout) :: u, v, w
real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in
end subroutine set_fields
class(field_t), intent(inout) :: f
real(dp), dimension(:, :, :), intent(in) :: arr
end subroutine set_field
end interface

abstract interface
Expand Down
61 changes: 44 additions & 17 deletions src/cuda/backend.f90
@@ -1,6 +1,7 @@
module m_cuda_backend
use iso_fortran_env, only: stderr => error_unit
use cudafor
use mpi

use m_allocator, only: allocator_t, field_t
use m_base_backend, only: base_backend_t
Expand All @@ -16,7 +17,7 @@ module m_cuda_backend
use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs
use m_cuda_kernels_reorder, only: &
reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2x, &
reorder_z2y, sum_yintox, sum_zintox, axpby, buffer_copy
reorder_z2y, sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy

implicit none

Expand All @@ -41,8 +42,9 @@ module m_cuda_backend
procedure :: sum_yintox => sum_yintox_cuda
procedure :: sum_zintox => sum_zintox_cuda
procedure :: vecadd => vecadd_cuda
procedure :: set_fields => set_fields_cuda
procedure :: get_fields => get_fields_cuda
procedure :: scalar_product => scalar_product_cuda
procedure :: set_field => set_field_cuda
procedure :: get_field => get_field_cuda
procedure :: transeq_cuda_dist
procedure :: transeq_cuda_thom
procedure :: tds_solve_dist
Expand Down Expand Up @@ -529,6 +531,35 @@ subroutine vecadd_cuda(self, a, x, b, y)

end subroutine vecadd_cuda

real(dp) function scalar_product_cuda(self, x, y) result(s)
implicit none

class(cuda_backend_t) :: self
class(field_t), intent(in) :: x, y

real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d
real(dp), device, allocatable :: sum_d
type(dim3) :: blocks, threads
integer :: n, ierr

select type(x); type is (cuda_field_t); x_d => x%data_d; end select
select type(y); type is (cuda_field_t); y_d => y%data_d; end select

allocate (sum_d)
sum_d = 0._dp

n = size(x_d, dim = 2)
blocks = dim3(size(x_d, dim = 3), 1, 1)
threads = dim3(SZ, 1, 1)
call scalar_product<<<blocks, threads>>>(sum_d, x_d, y_d, n)

s = sum_d

call MPI_Allreduce(MPI_IN_PLACE, s, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we're thinking about mixed/reduced precision, should the precision passed to MPI derive from the actual data type? Not sure what the typical way to do this is in MPI codes. @pbartholomew08, @Nanoseb?

MPI_COMM_WORLD, ierr)

end function scalar_product_cuda

subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n)
implicit none

Expand All @@ -547,31 +578,27 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n)

end subroutine copy_into_buffers

subroutine set_fields_cuda(self, u, v, w, u_in, v_in, w_in)
subroutine set_field_cuda(self, f, arr)
implicit none

class(cuda_backend_t) :: self
class(field_t), intent(inout) :: u, v, w
real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in
class(field_t), intent(inout) :: f
real(dp), dimension(:, :, :), intent(in) :: arr

select type(u); type is (cuda_field_t); u%data_d = u_in; end select
select type(v); type is (cuda_field_t); v%data_d = v_in; end select
select type(w); type is (cuda_field_t); w%data_d = w_in; end select
select type(f); type is (cuda_field_t); f%data_d = arr; end select

end subroutine set_fields_cuda
end subroutine set_field_cuda

subroutine get_fields_cuda(self, u_out, v_out, w_out, u, v, w)
subroutine get_field_cuda(self, arr, f)
implicit none

class(cuda_backend_t) :: self
real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out
class(field_t), intent(in) :: u, v, w
real(dp), dimension(:, :, :), intent(out) :: arr
class(field_t), intent(in) :: f

select type(u); type is (cuda_field_t); u_out = u%data_d; end select
select type(v); type is (cuda_field_t); v_out = v%data_d; end select
select type(w); type is (cuda_field_t); w_out = w%data_d; end select
select type(f); type is (cuda_field_t); arr = f%data_d; end select

end subroutine get_fields_cuda
end subroutine get_field_cuda

end module m_cuda_backend

21 changes: 21 additions & 0 deletions src/cuda/kernels/reorder.f90
Expand Up @@ -199,6 +199,27 @@ attributes(global) subroutine axpby(n, alpha, x, beta, y)

end subroutine axpby

attributes(global) subroutine scalar_product(s, x, y, n)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Caller needs to understand this is pencil sum to set correct n_threads/blocks. This should be documented here.

implicit none

real(dp), device, intent(inout) :: s
real(dp), device, intent(in), dimension(:, :, :) :: x, y
integer, value, intent(in) :: n

real(dp) :: s_pncl !! pencil sum
integer :: i, j, b, ierr

i = threadIdx%x
b = blockIdx%x

s_pncl = 0._dp
do j = 1, n
s_pncl = s_pncl + x(i, j, b)*y(i, j, b)
end do
ierr = atomicadd(s, s_pncl)

end subroutine scalar_product

attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo)
implicit none

Expand Down
39 changes: 23 additions & 16 deletions src/omp/backend.f90
Expand Up @@ -30,8 +30,9 @@ module m_omp_backend
procedure :: sum_yintox => sum_yintox_omp
procedure :: sum_zintox => sum_zintox_omp
procedure :: vecadd => vecadd_omp
procedure :: set_fields => set_fields_omp
procedure :: get_fields => get_fields_omp
procedure :: scalar_product => scalar_product_omp
procedure :: set_field => set_field_omp
procedure :: get_field => get_field_omp
procedure :: transeq_omp_dist
end type omp_backend_t

Expand Down Expand Up @@ -304,6 +305,16 @@ subroutine vecadd_omp(self, a, x, b, y)

end subroutine vecadd_omp

real(dp) function scalar_product_omp(self, x, y) result(s)
implicit none

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
! TODO incomplete implementation

class(omp_backend_t) :: self
class(field_t), intent(in) :: x, y

s = 0._dp

end function scalar_product_omp

subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks)
implicit none

Expand All @@ -329,31 +340,27 @@ subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks)

end subroutine copy_into_buffers

subroutine set_fields_omp(self, u, v, w, u_in, v_in, w_in)
subroutine set_field_omp(self, f, arr)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unclear argument names. Perhaps field and data?

implicit none

class(omp_backend_t) :: self
class(field_t), intent(inout) :: u, v, w
real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in
class(field_t), intent(inout) :: f
real(dp), dimension(:, :, :), intent(in) :: arr

u%data = u_in
v%data = v_in
w%data = w_in
f%data = arr

end subroutine set_fields_omp
end subroutine set_field_omp

subroutine get_fields_omp(self, u_out, v_out, w_out, u, v, w)
subroutine get_field_omp(self, arr, f)
implicit none

class(omp_backend_t) :: self
real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out
class(field_t), intent(in) :: u, v, w
real(dp), dimension(:, :, :), intent(out) :: arr
class(field_t), intent(in) :: f

u_out = u%data
v_out = v%data
w_out = w%data
arr = f%data

end subroutine get_fields_omp
end subroutine get_field_omp

end module m_omp_backend

12 changes: 6 additions & 6 deletions src/solver.f90
Expand Up @@ -93,9 +93,9 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) &
v_init = 0
w_init = 0

call solver%backend%set_fields( &
solver%u, solver%v, solver%w, u_init, v_init, w_init &
)
call solver%backend%set_field(solver%u, u_init)
call solver%backend%set_field(solver%v, v_init)
call solver%backend%set_field(solver%w, w_init)

deallocate(u_init, v_init, w_init)
print*, 'initial conditions are set'
Expand Down Expand Up @@ -548,9 +548,9 @@ subroutine run(self, n_iter, u_out, v_out, w_out)

print*, 'run end'

call self%backend%get_fields( &
u_out, v_out, w_out, self%u, self%v, self%w &
)
call self%backend%get_field(u_out, self%u)
call self%backend%get_field(v_out, self%v)
call self%backend%get_field(w_out, self%w)

end subroutine run

Expand Down