From 9129656c431c6f0482abd52d42f8279fa92b7c98 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Fri, 1 Mar 2024 14:22:39 +0000 Subject: [PATCH 1/6] Add scalar_product procedure in backends. --- src/backend.f90 | 15 +++++++++++++++ src/cuda/backend.f90 | 10 ++++++++++ src/omp/backend.f90 | 10 ++++++++++ 3 files changed, 35 insertions(+) diff --git a/src/backend.f90 b/src/backend.f90 index a3958859..f0d01b7d 100644 --- a/src/backend.f90 +++ b/src/backend.f90 @@ -33,6 +33,7 @@ module m_base_backend procedure(sum_intox), deferred :: sum_yintox procedure(sum_intox), deferred :: sum_zintox procedure(vecadd), deferred :: vecadd + procedure(scalar_product), deferred :: scalar_product procedure(get_fields), deferred :: get_fields procedure(set_fields), deferred :: set_fields procedure(alloc_tdsops), deferred :: alloc_tdsops @@ -125,6 +126,20 @@ subroutine vecadd(self, a, x, b, y) end subroutine vecadd end interface + abstract interface + subroutine scalar_product(self, s, x, y) + !! Calculates the scalar product of two input fields + import :: base_backend_t + import :: dp + import :: field_t + implicit none + + class(base_backend_t) :: self + real(dp), intent(out) :: s + class(field_t), intent(in) :: x, y + end subroutine scalar_product + end interface + abstract interface subroutine get_fields(self, u_out, v_out, w_out, u, v, w) !! copy the specialist data structure from device or host back diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index cd4acce4..ce959374 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -41,6 +41,7 @@ module m_cuda_backend procedure :: sum_yintox => sum_yintox_cuda procedure :: sum_zintox => sum_zintox_cuda procedure :: vecadd => vecadd_cuda + procedure :: scalar_product => scalar_product_cuda procedure :: set_fields => set_fields_cuda procedure :: get_fields => get_fields_cuda procedure :: transeq_cuda_dist @@ -529,6 +530,15 @@ subroutine vecadd_cuda(self, a, x, b, y) end subroutine vecadd_cuda + subroutine scalar_product_cuda(self, s, x, y) + implicit none + + class(cuda_backend_t) :: self + real(dp), intent(out) :: s + class(field_t), intent(in) :: x, y + + end subroutine scalar_product_cuda + subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) implicit none diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index dc6f7a1f..58507e6d 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -30,6 +30,7 @@ module m_omp_backend procedure :: sum_yintox => sum_yintox_omp procedure :: sum_zintox => sum_zintox_omp procedure :: vecadd => vecadd_omp + procedure :: scalar_product => scalar_product_omp procedure :: set_fields => set_fields_omp procedure :: get_fields => get_fields_omp procedure :: transeq_omp_dist @@ -304,6 +305,15 @@ subroutine vecadd_omp(self, a, x, b, y) end subroutine vecadd_omp + subroutine scalar_product_omp(self, s, x, y) + implicit none + + class(omp_backend_t) :: self + real(dp), intent(out) :: s + class(field_t), intent(in) :: x, y + + end subroutine scalar_product_omp + subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks) implicit none From 11608d49aac379ea3959c962cbea62b2fecd90bb Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Fri, 1 Mar 2024 14:23:50 +0000 Subject: [PATCH 2/6] Implement a scalar product kernel in CUDA backend. --- src/cuda/kernels/reorder.f90 | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/src/cuda/kernels/reorder.f90 b/src/cuda/kernels/reorder.f90 index 52c093c5..11c74cac 100644 --- a/src/cuda/kernels/reorder.f90 +++ b/src/cuda/kernels/reorder.f90 @@ -199,6 +199,27 @@ attributes(global) subroutine axpby(n, alpha, x, beta, y) end subroutine axpby + attributes(global) subroutine scalar_product(s, x, y, n) + implicit none + + real(dp), device, intent(inout) :: s + real(dp), device, intent(in), dimension(:, :, :) :: x, y + integer, value, intent(in) :: n + + real(dp) :: s_pncl !! pencil sum + integer :: i, j, b, ierr + + i = threadIdx%x + b = blockIdx%x + + s_pncl = 0._dp + do j = 1, n + s_pncl = s_pncl + x(i, j, b)*y(i, j, b) + end do + ierr = atomicadd(s, s_pncl) + + end subroutine scalar_product + attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo) implicit none From ce7b3f8983c3f37521b8206740e47137c150a86b Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Fri, 1 Mar 2024 14:25:39 +0000 Subject: [PATCH 3/6] Call the scalar product kernel in CUDA backend. --- src/cuda/backend.f90 | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index ce959374..5e1cdd4f 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -16,7 +16,7 @@ module m_cuda_backend use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs use m_cuda_kernels_reorder, only: & reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2x, & - reorder_z2y, sum_yintox, sum_zintox, axpby, buffer_copy + reorder_z2y, sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy implicit none @@ -537,6 +537,24 @@ subroutine scalar_product_cuda(self, s, x, y) real(dp), intent(out) :: s class(field_t), intent(in) :: x, y + real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d + real(dp), device, allocatable :: sum_d + type(dim3) :: blocks, threads + integer :: nx + + select type(x); type is (cuda_field_t); x_d => x%data_d; end select + select type(y); type is (cuda_field_t); y_d => y%data_d; end select + + allocate (sum_d) + sum_d = 0._dp + + nx = size(x_d, dim = 2) + blocks = dim3(size(x_d, dim = 3), 1, 1) + threads = dim3(SZ, 1, 1) + call scalar_product<<>>(sum_d, x_d, y_d, nx) + + s = sum_d + end subroutine scalar_product_cuda subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) From 030f382f8869dd35100ead57eed633c9c2cf4a77 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Fri, 1 Mar 2024 14:56:28 +0000 Subject: [PATCH 4/6] Simplify get/set_fields so that they work on a single field. --- src/backend.f90 | 22 +++++++++++----------- src/cuda/backend.f90 | 28 ++++++++++++---------------- src/omp/backend.f90 | 28 ++++++++++++---------------- src/solver.f90 | 12 ++++++------ 4 files changed, 41 insertions(+), 49 deletions(-) diff --git a/src/backend.f90 b/src/backend.f90 index f0d01b7d..6326ca8e 100644 --- a/src/backend.f90 +++ b/src/backend.f90 @@ -34,8 +34,8 @@ module m_base_backend procedure(sum_intox), deferred :: sum_zintox procedure(vecadd), deferred :: vecadd procedure(scalar_product), deferred :: scalar_product - procedure(get_fields), deferred :: get_fields - procedure(set_fields), deferred :: set_fields + procedure(get_field), deferred :: get_field + procedure(set_field), deferred :: set_field procedure(alloc_tdsops), deferred :: alloc_tdsops end type base_backend_t @@ -141,7 +141,7 @@ end subroutine scalar_product end interface abstract interface - subroutine get_fields(self, u_out, v_out, w_out, u, v, w) + subroutine get_field(self, arr, f) !! copy the specialist data structure from device or host back !! to a regular 3D data structure. import :: base_backend_t @@ -150,13 +150,13 @@ subroutine get_fields(self, u_out, v_out, w_out, u, v, w) implicit none class(base_backend_t) :: self - real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out - class(field_t), intent(in) :: u, v, w - end subroutine get_fields + real(dp), dimension(:, :, :), intent(out) :: arr + class(field_t), intent(in) :: f + end subroutine get_field - subroutine set_fields(self, u, v, w, u_in, v_in, w_in) + subroutine set_field(self, f, arr) !! copy the initial condition stored in a regular 3D data - !! structure into the specialist data structure arrays on the + !! structure into the specialist data structure array on the !! device or host. import :: base_backend_t import :: dp @@ -164,9 +164,9 @@ subroutine set_fields(self, u, v, w, u_in, v_in, w_in) implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: u, v, w - real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in - end subroutine set_fields + class(field_t), intent(inout) :: f + real(dp), dimension(:, :, :), intent(in) :: arr + end subroutine set_field end interface abstract interface diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 5e1cdd4f..4a47ed92 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -42,8 +42,8 @@ module m_cuda_backend procedure :: sum_zintox => sum_zintox_cuda procedure :: vecadd => vecadd_cuda procedure :: scalar_product => scalar_product_cuda - procedure :: set_fields => set_fields_cuda - procedure :: get_fields => get_fields_cuda + procedure :: set_field => set_field_cuda + procedure :: get_field => get_field_cuda procedure :: transeq_cuda_dist procedure :: transeq_cuda_thom procedure :: tds_solve_dist @@ -575,31 +575,27 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) end subroutine copy_into_buffers - subroutine set_fields_cuda(self, u, v, w, u_in, v_in, w_in) + subroutine set_field_cuda(self, f, arr) implicit none class(cuda_backend_t) :: self - class(field_t), intent(inout) :: u, v, w - real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in + class(field_t), intent(inout) :: f + real(dp), dimension(:, :, :), intent(in) :: arr - select type(u); type is (cuda_field_t); u%data_d = u_in; end select - select type(v); type is (cuda_field_t); v%data_d = v_in; end select - select type(w); type is (cuda_field_t); w%data_d = w_in; end select + select type(f); type is (cuda_field_t); f%data_d = arr; end select - end subroutine set_fields_cuda + end subroutine set_field_cuda - subroutine get_fields_cuda(self, u_out, v_out, w_out, u, v, w) + subroutine get_field_cuda(self, arr, f) implicit none class(cuda_backend_t) :: self - real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out - class(field_t), intent(in) :: u, v, w + real(dp), dimension(:, :, :), intent(out) :: arr + class(field_t), intent(in) :: f - select type(u); type is (cuda_field_t); u_out = u%data_d; end select - select type(v); type is (cuda_field_t); v_out = v%data_d; end select - select type(w); type is (cuda_field_t); w_out = w%data_d; end select + select type(f); type is (cuda_field_t); arr = f%data_d; end select - end subroutine get_fields_cuda + end subroutine get_field_cuda end module m_cuda_backend diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 58507e6d..64ab5c00 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -31,8 +31,8 @@ module m_omp_backend procedure :: sum_zintox => sum_zintox_omp procedure :: vecadd => vecadd_omp procedure :: scalar_product => scalar_product_omp - procedure :: set_fields => set_fields_omp - procedure :: get_fields => get_fields_omp + procedure :: set_field => set_field_omp + procedure :: get_field => get_field_omp procedure :: transeq_omp_dist end type omp_backend_t @@ -339,31 +339,27 @@ subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks) end subroutine copy_into_buffers - subroutine set_fields_omp(self, u, v, w, u_in, v_in, w_in) + subroutine set_field_omp(self, f, arr) implicit none class(omp_backend_t) :: self - class(field_t), intent(inout) :: u, v, w - real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in + class(field_t), intent(inout) :: f + real(dp), dimension(:, :, :), intent(in) :: arr - u%data = u_in - v%data = v_in - w%data = w_in + f%data = arr - end subroutine set_fields_omp + end subroutine set_field_omp - subroutine get_fields_omp(self, u_out, v_out, w_out, u, v, w) + subroutine get_field_omp(self, arr, f) implicit none class(omp_backend_t) :: self - real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out - class(field_t), intent(in) :: u, v, w + real(dp), dimension(:, :, :), intent(out) :: arr + class(field_t), intent(in) :: f - u_out = u%data - v_out = v%data - w_out = w%data + arr = f%data - end subroutine get_fields_omp + end subroutine get_field_omp end module m_omp_backend diff --git a/src/solver.f90 b/src/solver.f90 index 60eead1c..fad0a715 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -93,9 +93,9 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & v_init = 0 w_init = 0 - call solver%backend%set_fields( & - solver%u, solver%v, solver%w, u_init, v_init, w_init & - ) + call solver%backend%set_field(solver%u, u_init) + call solver%backend%set_field(solver%v, v_init) + call solver%backend%set_field(solver%w, w_init) deallocate(u_init, v_init, w_init) print*, 'initial conditions are set' @@ -548,9 +548,9 @@ subroutine run(self, n_iter, u_out, v_out, w_out) print*, 'run end' - call self%backend%get_fields( & - u_out, v_out, w_out, self%u, self%v, self%w & - ) + call self%backend%get_field(u_out, self%u) + call self%backend%get_field(v_out, self%v) + call self%backend%get_field(w_out, self%w) end subroutine run From ed12712a9650016a1cce8af6deb463e854393ec4 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Mon, 4 Mar 2024 07:56:32 +0000 Subject: [PATCH 5/6] Convert scalar_product subroutine to a function. --- src/backend.f90 | 5 ++--- src/cuda/backend.f90 | 5 ++--- src/omp/backend.f90 | 7 ++++--- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/src/backend.f90 b/src/backend.f90 index 6326ca8e..cf453dad 100644 --- a/src/backend.f90 +++ b/src/backend.f90 @@ -127,7 +127,7 @@ end subroutine vecadd end interface abstract interface - subroutine scalar_product(self, s, x, y) + real(dp) function scalar_product(self, x, y) result(s) !! Calculates the scalar product of two input fields import :: base_backend_t import :: dp @@ -135,9 +135,8 @@ subroutine scalar_product(self, s, x, y) implicit none class(base_backend_t) :: self - real(dp), intent(out) :: s class(field_t), intent(in) :: x, y - end subroutine scalar_product + end function scalar_product end interface abstract interface diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 4a47ed92..f41b9c02 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -530,11 +530,10 @@ subroutine vecadd_cuda(self, a, x, b, y) end subroutine vecadd_cuda - subroutine scalar_product_cuda(self, s, x, y) + real(dp) function scalar_product_cuda(self, x, y) result(s) implicit none class(cuda_backend_t) :: self - real(dp), intent(out) :: s class(field_t), intent(in) :: x, y real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d @@ -555,7 +554,7 @@ subroutine scalar_product_cuda(self, s, x, y) s = sum_d - end subroutine scalar_product_cuda + end function scalar_product_cuda subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) implicit none diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 64ab5c00..aca287bd 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -305,14 +305,15 @@ subroutine vecadd_omp(self, a, x, b, y) end subroutine vecadd_omp - subroutine scalar_product_omp(self, s, x, y) + real(dp) function scalar_product_omp(self, x, y) result(s) implicit none class(omp_backend_t) :: self - real(dp), intent(out) :: s class(field_t), intent(in) :: x, y - end subroutine scalar_product_omp + s = 0._dp + + end function scalar_product_omp subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks) implicit none From ce9289fd320eef992d68d2231195930badce2d18 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Mon, 4 Mar 2024 08:14:40 +0000 Subject: [PATCH 6/6] Make scalar_product function work on multiple ranks. --- src/cuda/backend.f90 | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index f41b9c02..1097d574 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -1,6 +1,7 @@ module m_cuda_backend use iso_fortran_env, only: stderr => error_unit use cudafor + use mpi use m_allocator, only: allocator_t, field_t use m_base_backend, only: base_backend_t @@ -539,7 +540,7 @@ real(dp) function scalar_product_cuda(self, x, y) result(s) real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d real(dp), device, allocatable :: sum_d type(dim3) :: blocks, threads - integer :: nx + integer :: n, ierr select type(x); type is (cuda_field_t); x_d => x%data_d; end select select type(y); type is (cuda_field_t); y_d => y%data_d; end select @@ -547,13 +548,16 @@ real(dp) function scalar_product_cuda(self, x, y) result(s) allocate (sum_d) sum_d = 0._dp - nx = size(x_d, dim = 2) + n = size(x_d, dim = 2) blocks = dim3(size(x_d, dim = 3), 1, 1) threads = dim3(SZ, 1, 1) - call scalar_product<<>>(sum_d, x_d, y_d, nx) + call scalar_product<<>>(sum_d, x_d, y_d, n) s = sum_d + call MPI_Allreduce(MPI_IN_PLACE, s, 1, MPI_DOUBLE_PRECISION, MPI_SUM, & + MPI_COMM_WORLD, ierr) + end function scalar_product_cuda subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n)