From 9129656c431c6f0482abd52d42f8279fa92b7c98 Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Fri, 1 Mar 2024 14:22:39 +0000
Subject: [PATCH 1/6] Add scalar_product procedure in backends.

---
 src/backend.f90      | 15 +++++++++++++++
 src/cuda/backend.f90 | 10 ++++++++++
 src/omp/backend.f90  | 10 ++++++++++
 3 files changed, 35 insertions(+)

diff --git a/src/backend.f90 b/src/backend.f90
index a3958859..f0d01b7d 100644
--- a/src/backend.f90
+++ b/src/backend.f90
@@ -33,6 +33,7 @@ module m_base_backend
       procedure(sum_intox), deferred :: sum_yintox
       procedure(sum_intox), deferred :: sum_zintox
       procedure(vecadd), deferred :: vecadd
+      procedure(scalar_product), deferred :: scalar_product
       procedure(get_fields), deferred :: get_fields
       procedure(set_fields), deferred :: set_fields
       procedure(alloc_tdsops), deferred :: alloc_tdsops
@@ -125,6 +126,20 @@ subroutine vecadd(self, a, x, b, y)
       end subroutine vecadd
    end interface
 
+   abstract interface
+      subroutine scalar_product(self, s, x, y)
+         !! Calculates the scalar product of two input fields
+         import :: base_backend_t
+         import :: dp
+         import :: field_t
+         implicit none
+
+         class(base_backend_t) :: self
+         real(dp), intent(out) :: s
+         class(field_t), intent(in) :: x, y
+      end subroutine scalar_product
+   end interface
+
    abstract interface
       subroutine get_fields(self, u_out, v_out, w_out, u, v, w)
          !! copy the specialist data structure from device or host back
diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
index cd4acce4..ce959374 100644
--- a/src/cuda/backend.f90
+++ b/src/cuda/backend.f90
@@ -41,6 +41,7 @@ module m_cuda_backend
       procedure :: sum_yintox => sum_yintox_cuda
       procedure :: sum_zintox => sum_zintox_cuda
       procedure :: vecadd => vecadd_cuda
+      procedure :: scalar_product => scalar_product_cuda
       procedure :: set_fields => set_fields_cuda
       procedure :: get_fields => get_fields_cuda
       procedure :: transeq_cuda_dist
@@ -529,6 +530,15 @@ subroutine vecadd_cuda(self, a, x, b, y)
 
    end subroutine vecadd_cuda
 
+   subroutine scalar_product_cuda(self, s, x, y)
+      implicit none
+
+      class(cuda_backend_t) :: self
+      real(dp), intent(out) :: s
+      class(field_t), intent(in) :: x, y
+
+   end subroutine scalar_product_cuda
+
    subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n)
       implicit none
 
diff --git a/src/omp/backend.f90 b/src/omp/backend.f90
index dc6f7a1f..58507e6d 100644
--- a/src/omp/backend.f90
+++ b/src/omp/backend.f90
@@ -30,6 +30,7 @@ module m_omp_backend
       procedure :: sum_yintox => sum_yintox_omp
       procedure :: sum_zintox => sum_zintox_omp
       procedure :: vecadd => vecadd_omp
+      procedure :: scalar_product => scalar_product_omp
       procedure :: set_fields => set_fields_omp
       procedure :: get_fields => get_fields_omp
       procedure :: transeq_omp_dist
@@ -304,6 +305,15 @@ subroutine vecadd_omp(self, a, x, b, y)
 
    end subroutine vecadd_omp
 
+   subroutine scalar_product_omp(self, s, x, y)
+      implicit none
+
+      class(omp_backend_t) :: self
+      real(dp), intent(out) :: s
+      class(field_t), intent(in) :: x, y
+
+   end subroutine scalar_product_omp
+
    subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks)
       implicit none
 

From 11608d49aac379ea3959c962cbea62b2fecd90bb Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Fri, 1 Mar 2024 14:23:50 +0000
Subject: [PATCH 2/6] Implement a scalar product kernel in CUDA backend.

---
 src/cuda/kernels/reorder.f90 | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/src/cuda/kernels/reorder.f90 b/src/cuda/kernels/reorder.f90
index 52c093c5..11c74cac 100644
--- a/src/cuda/kernels/reorder.f90
+++ b/src/cuda/kernels/reorder.f90
@@ -199,6 +199,27 @@ attributes(global) subroutine axpby(n, alpha, x, beta, y)
 
    end subroutine axpby
 
+   attributes(global) subroutine scalar_product(s, x, y, n)
+      implicit none
+
+      real(dp), device, intent(inout) :: s
+      real(dp), device, intent(in), dimension(:, :, :) :: x, y
+      integer, value, intent(in) :: n
+
+      real(dp) :: s_pncl !! pencil sum
+      integer :: i, j, b, ierr
+
+      i = threadIdx%x
+      b = blockIdx%x
+
+      s_pncl = 0._dp
+      do j = 1, n
+         s_pncl = s_pncl + x(i, j, b)*y(i, j, b)
+      end do
+      ierr = atomicadd(s, s_pncl)
+
+   end subroutine scalar_product
+
    attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo)
       implicit none
 

From ce7b3f8983c3f37521b8206740e47137c150a86b Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Fri, 1 Mar 2024 14:25:39 +0000
Subject: [PATCH 3/6] Call the scalar product kernel in CUDA backend.

---
 src/cuda/backend.f90 | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
index ce959374..5e1cdd4f 100644
--- a/src/cuda/backend.f90
+++ b/src/cuda/backend.f90
@@ -16,7 +16,7 @@ module m_cuda_backend
    use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs
    use m_cuda_kernels_reorder, only: &
        reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2x, &
-       reorder_z2y, sum_yintox, sum_zintox, axpby, buffer_copy
+       reorder_z2y, sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy
 
    implicit none
 
@@ -537,6 +537,24 @@ subroutine scalar_product_cuda(self, s, x, y)
       real(dp), intent(out) :: s
       class(field_t), intent(in) :: x, y
 
+      real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d
+      real(dp), device, allocatable :: sum_d
+      type(dim3) :: blocks, threads
+      integer :: nx
+
+      select type(x); type is (cuda_field_t); x_d => x%data_d; end select
+      select type(y); type is (cuda_field_t); y_d => y%data_d; end select
+
+      allocate (sum_d)
+      sum_d = 0._dp
+
+      nx = size(x_d, dim = 2)
+      blocks = dim3(size(x_d, dim = 3), 1, 1)
+      threads = dim3(SZ, 1, 1)
+      call scalar_product<<<blocks, threads>>>(sum_d, x_d, y_d, nx)
+
+      s = sum_d
+
    end subroutine scalar_product_cuda
 
    subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n)

From 030f382f8869dd35100ead57eed633c9c2cf4a77 Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Fri, 1 Mar 2024 14:56:28 +0000
Subject: [PATCH 4/6] Simplify get/set_fields so that they work on a single
 field.

---
 src/backend.f90      | 22 +++++++++++-----------
 src/cuda/backend.f90 | 28 ++++++++++++----------------
 src/omp/backend.f90  | 28 ++++++++++++----------------
 src/solver.f90       | 12 ++++++------
 4 files changed, 41 insertions(+), 49 deletions(-)

diff --git a/src/backend.f90 b/src/backend.f90
index f0d01b7d..6326ca8e 100644
--- a/src/backend.f90
+++ b/src/backend.f90
@@ -34,8 +34,8 @@ module m_base_backend
       procedure(sum_intox), deferred :: sum_zintox
       procedure(vecadd), deferred :: vecadd
       procedure(scalar_product), deferred :: scalar_product
-      procedure(get_fields), deferred :: get_fields
-      procedure(set_fields), deferred :: set_fields
+      procedure(get_field), deferred :: get_field
+      procedure(set_field), deferred :: set_field
       procedure(alloc_tdsops), deferred :: alloc_tdsops
    end type base_backend_t
 
@@ -141,7 +141,7 @@ end subroutine scalar_product
    end interface
 
    abstract interface
-      subroutine get_fields(self, u_out, v_out, w_out, u, v, w)
+      subroutine get_field(self, arr, f)
          !! copy the specialist data structure from device or host back
          !! to a regular 3D data structure.
          import :: base_backend_t
@@ -150,13 +150,13 @@ subroutine get_fields(self, u_out, v_out, w_out, u, v, w)
          implicit none
 
          class(base_backend_t) :: self
-         real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out
-         class(field_t), intent(in) :: u, v, w
-      end subroutine get_fields
+         real(dp), dimension(:, :, :), intent(out) :: arr
+         class(field_t), intent(in) :: f
+      end subroutine get_field
 
-      subroutine set_fields(self, u, v, w, u_in, v_in, w_in)
+      subroutine set_field(self, f, arr)
          !! copy the initial condition stored in a regular 3D data
-         !! structure into the specialist data structure arrays on the
+         !! structure into the specialist data structure array on the
          !! device or host.
          import :: base_backend_t
          import :: dp
@@ -164,9 +164,9 @@ subroutine set_fields(self, u, v, w, u_in, v_in, w_in)
          implicit none
 
          class(base_backend_t) :: self
-         class(field_t), intent(inout) :: u, v, w
-         real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in
-      end subroutine set_fields
+         class(field_t), intent(inout) :: f
+         real(dp), dimension(:, :, :), intent(in) :: arr
+      end subroutine set_field
    end interface
 
    abstract interface
diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
index 5e1cdd4f..4a47ed92 100644
--- a/src/cuda/backend.f90
+++ b/src/cuda/backend.f90
@@ -42,8 +42,8 @@ module m_cuda_backend
       procedure :: sum_zintox => sum_zintox_cuda
       procedure :: vecadd => vecadd_cuda
       procedure :: scalar_product => scalar_product_cuda
-      procedure :: set_fields => set_fields_cuda
-      procedure :: get_fields => get_fields_cuda
+      procedure :: set_field => set_field_cuda
+      procedure :: get_field => get_field_cuda
       procedure :: transeq_cuda_dist
       procedure :: transeq_cuda_thom
       procedure :: tds_solve_dist
@@ -575,31 +575,27 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n)
 
    end subroutine copy_into_buffers
 
-   subroutine set_fields_cuda(self, u, v, w, u_in, v_in, w_in)
+   subroutine set_field_cuda(self, f, arr)
       implicit none
 
       class(cuda_backend_t) :: self
-      class(field_t), intent(inout) :: u, v, w
-      real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in
+      class(field_t), intent(inout) :: f
+      real(dp), dimension(:, :, :), intent(in) :: arr
 
-      select type(u); type is (cuda_field_t); u%data_d = u_in; end select
-      select type(v); type is (cuda_field_t); v%data_d = v_in; end select
-      select type(w); type is (cuda_field_t); w%data_d = w_in; end select
+      select type(f); type is (cuda_field_t); f%data_d = arr; end select
 
-   end subroutine set_fields_cuda
+   end subroutine set_field_cuda
 
-   subroutine get_fields_cuda(self, u_out, v_out, w_out, u, v, w)
+   subroutine get_field_cuda(self, arr, f)
       implicit none
 
       class(cuda_backend_t) :: self
-      real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out
-      class(field_t), intent(in) :: u, v, w
+      real(dp), dimension(:, :, :), intent(out) :: arr
+      class(field_t), intent(in) :: f
 
-      select type(u); type is (cuda_field_t); u_out = u%data_d; end select
-      select type(v); type is (cuda_field_t); v_out = v%data_d; end select
-      select type(w); type is (cuda_field_t); w_out = w%data_d; end select
+      select type(f); type is (cuda_field_t); arr = f%data_d; end select
 
-   end subroutine get_fields_cuda
+   end subroutine get_field_cuda
 
 end module m_cuda_backend
 
diff --git a/src/omp/backend.f90 b/src/omp/backend.f90
index 58507e6d..64ab5c00 100644
--- a/src/omp/backend.f90
+++ b/src/omp/backend.f90
@@ -31,8 +31,8 @@ module m_omp_backend
       procedure :: sum_zintox => sum_zintox_omp
       procedure :: vecadd => vecadd_omp
       procedure :: scalar_product => scalar_product_omp
-      procedure :: set_fields => set_fields_omp
-      procedure :: get_fields => get_fields_omp
+      procedure :: set_field => set_field_omp
+      procedure :: get_field => get_field_omp
       procedure :: transeq_omp_dist
    end type omp_backend_t
 
@@ -339,31 +339,27 @@ subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks)
 
    end subroutine copy_into_buffers
 
-   subroutine set_fields_omp(self, u, v, w, u_in, v_in, w_in)
+   subroutine set_field_omp(self, f, arr)
       implicit none
 
       class(omp_backend_t) :: self
-      class(field_t), intent(inout) :: u, v, w
-      real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in
+      class(field_t), intent(inout) :: f
+      real(dp), dimension(:, :, :), intent(in) :: arr
 
-      u%data = u_in
-      v%data = v_in
-      w%data = w_in
+      f%data = arr
 
-   end subroutine set_fields_omp
+   end subroutine set_field_omp
 
-   subroutine get_fields_omp(self, u_out, v_out, w_out, u, v, w)
+   subroutine get_field_omp(self, arr, f)
       implicit none
 
       class(omp_backend_t) :: self
-      real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out
-      class(field_t), intent(in) :: u, v, w
+      real(dp), dimension(:, :, :), intent(out) :: arr
+      class(field_t), intent(in) :: f
 
-      u_out = u%data
-      v_out = v%data
-      w_out = w%data
+      arr = f%data
 
-   end subroutine get_fields_omp
+   end subroutine get_field_omp
 
 end module m_omp_backend
 
diff --git a/src/solver.f90 b/src/solver.f90
index 60eead1c..fad0a715 100644
--- a/src/solver.f90
+++ b/src/solver.f90
@@ -93,9 +93,9 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) &
       v_init = 0
       w_init = 0
 
-      call solver%backend%set_fields( &
-         solver%u, solver%v, solver%w, u_init, v_init, w_init &
-      )
+      call solver%backend%set_field(solver%u, u_init)
+      call solver%backend%set_field(solver%v, v_init)
+      call solver%backend%set_field(solver%w, w_init)
 
       deallocate(u_init, v_init, w_init)
       print*, 'initial conditions are set'
@@ -548,9 +548,9 @@ subroutine run(self, n_iter, u_out, v_out, w_out)
 
       print*, 'run end'
 
-      call self%backend%get_fields( &
-         u_out, v_out, w_out, self%u, self%v, self%w &
-      )
+      call self%backend%get_field(u_out, self%u)
+      call self%backend%get_field(v_out, self%v)
+      call self%backend%get_field(w_out, self%w)
 
    end subroutine run
 

From ed12712a9650016a1cce8af6deb463e854393ec4 Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Mon, 4 Mar 2024 07:56:32 +0000
Subject: [PATCH 5/6] Convert scalar_product subroutine to a function.

---
 src/backend.f90      | 5 ++---
 src/cuda/backend.f90 | 5 ++---
 src/omp/backend.f90  | 7 ++++---
 3 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/backend.f90 b/src/backend.f90
index 6326ca8e..cf453dad 100644
--- a/src/backend.f90
+++ b/src/backend.f90
@@ -127,7 +127,7 @@ end subroutine vecadd
    end interface
 
    abstract interface
-      subroutine scalar_product(self, s, x, y)
+      real(dp) function scalar_product(self, x, y) result(s)
          !! Calculates the scalar product of two input fields
          import :: base_backend_t
          import :: dp
@@ -135,9 +135,8 @@ subroutine scalar_product(self, s, x, y)
          implicit none
 
          class(base_backend_t) :: self
-         real(dp), intent(out) :: s
          class(field_t), intent(in) :: x, y
-      end subroutine scalar_product
+      end function scalar_product
    end interface
 
    abstract interface
diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
index 4a47ed92..f41b9c02 100644
--- a/src/cuda/backend.f90
+++ b/src/cuda/backend.f90
@@ -530,11 +530,10 @@ subroutine vecadd_cuda(self, a, x, b, y)
 
    end subroutine vecadd_cuda
 
-   subroutine scalar_product_cuda(self, s, x, y)
+   real(dp) function scalar_product_cuda(self, x, y) result(s)
       implicit none
 
       class(cuda_backend_t) :: self
-      real(dp), intent(out) :: s
       class(field_t), intent(in) :: x, y
 
       real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d
@@ -555,7 +554,7 @@ subroutine scalar_product_cuda(self, s, x, y)
 
       s = sum_d
 
-   end subroutine scalar_product_cuda
+   end function scalar_product_cuda
 
    subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n)
       implicit none
diff --git a/src/omp/backend.f90 b/src/omp/backend.f90
index 64ab5c00..aca287bd 100644
--- a/src/omp/backend.f90
+++ b/src/omp/backend.f90
@@ -305,14 +305,15 @@ subroutine vecadd_omp(self, a, x, b, y)
 
    end subroutine vecadd_omp
 
-   subroutine scalar_product_omp(self, s, x, y)
+   real(dp) function scalar_product_omp(self, x, y) result(s)
       implicit none
 
       class(omp_backend_t) :: self
-      real(dp), intent(out) :: s
       class(field_t), intent(in) :: x, y
 
-   end subroutine scalar_product_omp
+      s = 0._dp
+
+   end function scalar_product_omp
 
    subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks)
       implicit none

From ce9289fd320eef992d68d2231195930badce2d18 Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Mon, 4 Mar 2024 08:14:40 +0000
Subject: [PATCH 6/6] Make scalar_product function work on multiple ranks.

---
 src/cuda/backend.f90 | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
index f41b9c02..1097d574 100644
--- a/src/cuda/backend.f90
+++ b/src/cuda/backend.f90
@@ -1,6 +1,7 @@
 module m_cuda_backend
    use iso_fortran_env, only: stderr => error_unit
    use cudafor
+   use mpi
 
    use m_allocator, only: allocator_t, field_t
    use m_base_backend, only: base_backend_t
@@ -539,7 +540,7 @@ real(dp) function scalar_product_cuda(self, x, y) result(s)
       real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d
       real(dp), device, allocatable :: sum_d
       type(dim3) :: blocks, threads
-      integer :: nx
+      integer :: n, ierr
 
       select type(x); type is (cuda_field_t); x_d => x%data_d; end select
       select type(y); type is (cuda_field_t); y_d => y%data_d; end select
@@ -547,13 +548,16 @@ real(dp) function scalar_product_cuda(self, x, y) result(s)
       allocate (sum_d)
       sum_d = 0._dp
 
-      nx = size(x_d, dim = 2)
+      n = size(x_d, dim = 2)
       blocks = dim3(size(x_d, dim = 3), 1, 1)
       threads = dim3(SZ, 1, 1)
-      call scalar_product<<<blocks, threads>>>(sum_d, x_d, y_d, nx)
+      call scalar_product<<<blocks, threads>>>(sum_d, x_d, y_d, n)
 
       s = sum_d
 
+      call MPI_Allreduce(MPI_IN_PLACE, s, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                         MPI_COMM_WORLD, ierr)
+
    end function scalar_product_cuda
 
    subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n)