diff --git a/src/backend.f90 b/src/backend.f90 index a3958859..cf453dad 100644 --- a/src/backend.f90 +++ b/src/backend.f90 @@ -33,8 +33,9 @@ module m_base_backend procedure(sum_intox), deferred :: sum_yintox procedure(sum_intox), deferred :: sum_zintox procedure(vecadd), deferred :: vecadd - procedure(get_fields), deferred :: get_fields - procedure(set_fields), deferred :: set_fields + procedure(scalar_product), deferred :: scalar_product + procedure(get_field), deferred :: get_field + procedure(set_field), deferred :: set_field procedure(alloc_tdsops), deferred :: alloc_tdsops end type base_backend_t @@ -126,7 +127,20 @@ end subroutine vecadd end interface abstract interface - subroutine get_fields(self, u_out, v_out, w_out, u, v, w) + real(dp) function scalar_product(self, x, y) result(s) + !! Calculates the scalar product of two input fields + import :: base_backend_t + import :: dp + import :: field_t + implicit none + + class(base_backend_t) :: self + class(field_t), intent(in) :: x, y + end function scalar_product + end interface + + abstract interface + subroutine get_field(self, arr, f) !! copy the specialist data structure from device or host back !! to a regular 3D data structure. import :: base_backend_t @@ -135,13 +149,13 @@ subroutine get_fields(self, u_out, v_out, w_out, u, v, w) implicit none class(base_backend_t) :: self - real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out - class(field_t), intent(in) :: u, v, w - end subroutine get_fields + real(dp), dimension(:, :, :), intent(out) :: arr + class(field_t), intent(in) :: f + end subroutine get_field - subroutine set_fields(self, u, v, w, u_in, v_in, w_in) + subroutine set_field(self, f, arr) !! copy the initial condition stored in a regular 3D data - !! structure into the specialist data structure arrays on the + !! structure into the specialist data structure array on the !! device or host. import :: base_backend_t import :: dp @@ -149,9 +163,9 @@ subroutine set_fields(self, u, v, w, u_in, v_in, w_in) implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: u, v, w - real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in - end subroutine set_fields + class(field_t), intent(inout) :: f + real(dp), dimension(:, :, :), intent(in) :: arr + end subroutine set_field end interface abstract interface diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index cd4acce4..1097d574 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -1,6 +1,7 @@ module m_cuda_backend use iso_fortran_env, only: stderr => error_unit use cudafor + use mpi use m_allocator, only: allocator_t, field_t use m_base_backend, only: base_backend_t @@ -16,7 +17,7 @@ module m_cuda_backend use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs use m_cuda_kernels_reorder, only: & reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2x, & - reorder_z2y, sum_yintox, sum_zintox, axpby, buffer_copy + reorder_z2y, sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy implicit none @@ -41,8 +42,9 @@ module m_cuda_backend procedure :: sum_yintox => sum_yintox_cuda procedure :: sum_zintox => sum_zintox_cuda procedure :: vecadd => vecadd_cuda - procedure :: set_fields => set_fields_cuda - procedure :: get_fields => get_fields_cuda + procedure :: scalar_product => scalar_product_cuda + procedure :: set_field => set_field_cuda + procedure :: get_field => get_field_cuda procedure :: transeq_cuda_dist procedure :: transeq_cuda_thom procedure :: tds_solve_dist @@ -529,6 +531,35 @@ subroutine vecadd_cuda(self, a, x, b, y) end subroutine vecadd_cuda + real(dp) function scalar_product_cuda(self, x, y) result(s) + implicit none + + class(cuda_backend_t) :: self + class(field_t), intent(in) :: x, y + + real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d + real(dp), device, allocatable :: sum_d + type(dim3) :: blocks, threads + integer :: n, ierr + + select type(x); type is (cuda_field_t); x_d => x%data_d; end select + select type(y); type is (cuda_field_t); y_d => y%data_d; end select + + allocate (sum_d) + sum_d = 0._dp + + n = size(x_d, dim = 2) + blocks = dim3(size(x_d, dim = 3), 1, 1) + threads = dim3(SZ, 1, 1) + call scalar_product<<>>(sum_d, x_d, y_d, n) + + s = sum_d + + call MPI_Allreduce(MPI_IN_PLACE, s, 1, MPI_DOUBLE_PRECISION, MPI_SUM, & + MPI_COMM_WORLD, ierr) + + end function scalar_product_cuda + subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) implicit none @@ -547,31 +578,27 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) end subroutine copy_into_buffers - subroutine set_fields_cuda(self, u, v, w, u_in, v_in, w_in) + subroutine set_field_cuda(self, f, arr) implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: u, v, w - real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in + class(field_t), intent(inout) :: f + real(dp), dimension(:, :, :), intent(in) :: arr - select type(u); type is (cuda_field_t); u%data_d = u_in; end select - select type(v); type is (cuda_field_t); v%data_d = v_in; end select - select type(w); type is (cuda_field_t); w%data_d = w_in; end select + select type(f); type is (cuda_field_t); f%data_d = arr; end select - end subroutine set_fields_cuda + end subroutine set_field_cuda - subroutine get_fields_cuda(self, u_out, v_out, w_out, u, v, w) + subroutine get_field_cuda(self, arr, f) implicit none class(cuda_backend_t) :: self - real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out - class(field_t), intent(in) :: u, v, w + real(dp), dimension(:, :, :), intent(out) :: arr + class(field_t), intent(in) :: f - select type(u); type is (cuda_field_t); u_out = u%data_d; end select - select type(v); type is (cuda_field_t); v_out = v%data_d; end select - select type(w); type is (cuda_field_t); w_out = w%data_d; end select + select type(f); type is (cuda_field_t); arr = f%data_d; end select - end subroutine get_fields_cuda + end subroutine get_field_cuda end module m_cuda_backend diff --git a/src/cuda/kernels/reorder.f90 b/src/cuda/kernels/reorder.f90 index 52c093c5..11c74cac 100644 --- a/src/cuda/kernels/reorder.f90 +++ b/src/cuda/kernels/reorder.f90 @@ -199,6 +199,27 @@ attributes(global) subroutine axpby(n, alpha, x, beta, y) end subroutine axpby + attributes(global) subroutine scalar_product(s, x, y, n) + implicit none + + real(dp), device, intent(inout) :: s + real(dp), device, intent(in), dimension(:, :, :) :: x, y + integer, value, intent(in) :: n + + real(dp) :: s_pncl !! pencil sum + integer :: i, j, b, ierr + + i = threadIdx%x + b = blockIdx%x + + s_pncl = 0._dp + do j = 1, n + s_pncl = s_pncl + x(i, j, b)*y(i, j, b) + end do + ierr = atomicadd(s, s_pncl) + + end subroutine scalar_product + attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo) implicit none diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index dc6f7a1f..aca287bd 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -30,8 +30,9 @@ module m_omp_backend procedure :: sum_yintox => sum_yintox_omp procedure :: sum_zintox => sum_zintox_omp procedure :: vecadd => vecadd_omp - procedure :: set_fields => set_fields_omp - procedure :: get_fields => get_fields_omp + procedure :: scalar_product => scalar_product_omp + procedure :: set_field => set_field_omp + procedure :: get_field => get_field_omp procedure :: transeq_omp_dist end type omp_backend_t @@ -304,6 +305,16 @@ subroutine vecadd_omp(self, a, x, b, y) end subroutine vecadd_omp + real(dp) function scalar_product_omp(self, x, y) result(s) + implicit none + + class(omp_backend_t) :: self + class(field_t), intent(in) :: x, y + + s = 0._dp + + end function scalar_product_omp + subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks) implicit none @@ -329,31 +340,27 @@ subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks) end subroutine copy_into_buffers - subroutine set_fields_omp(self, u, v, w, u_in, v_in, w_in) + subroutine set_field_omp(self, f, arr) implicit none class(omp_backend_t) :: self - class(field_t), intent(inout) :: u, v, w - real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in + class(field_t), intent(inout) :: f + real(dp), dimension(:, :, :), intent(in) :: arr - u%data = u_in - v%data = v_in - w%data = w_in + f%data = arr - end subroutine set_fields_omp + end subroutine set_field_omp - subroutine get_fields_omp(self, u_out, v_out, w_out, u, v, w) + subroutine get_field_omp(self, arr, f) implicit none class(omp_backend_t) :: self - real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out - class(field_t), intent(in) :: u, v, w + real(dp), dimension(:, :, :), intent(out) :: arr + class(field_t), intent(in) :: f - u_out = u%data - v_out = v%data - w_out = w%data + arr = f%data - end subroutine get_fields_omp + end subroutine get_field_omp end module m_omp_backend diff --git a/src/solver.f90 b/src/solver.f90 index 60eead1c..fad0a715 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -93,9 +93,9 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & v_init = 0 w_init = 0 - call solver%backend%set_fields( & - solver%u, solver%v, solver%w, u_init, v_init, w_init & - ) + call solver%backend%set_field(solver%u, u_init) + call solver%backend%set_field(solver%v, v_init) + call solver%backend%set_field(solver%w, w_init) deallocate(u_init, v_init, w_init) print*, 'initial conditions are set' @@ -548,9 +548,9 @@ subroutine run(self, n_iter, u_out, v_out, w_out) print*, 'run end' - call self%backend%get_fields( & - u_out, v_out, w_out, self%u, self%v, self%w & - ) + call self%backend%get_field(u_out, self%u) + call self%backend%get_field(v_out, self%v) + call self%backend%get_field(w_out, self%w) end subroutine run