diff --git a/src/backend.f90 b/src/backend.f90
index a3958859..cf453dad 100644
--- a/src/backend.f90
+++ b/src/backend.f90
@@ -33,8 +33,9 @@ module m_base_backend
       procedure(sum_intox), deferred :: sum_yintox
       procedure(sum_intox), deferred :: sum_zintox
       procedure(vecadd), deferred :: vecadd
-      procedure(get_fields), deferred :: get_fields
-      procedure(set_fields), deferred :: set_fields
+      procedure(scalar_product), deferred :: scalar_product
+      procedure(get_field), deferred :: get_field
+      procedure(set_field), deferred :: set_field
       procedure(alloc_tdsops), deferred :: alloc_tdsops
    end type base_backend_t
 
@@ -126,7 +127,20 @@ end subroutine vecadd
    end interface
 
    abstract interface
-      subroutine get_fields(self, u_out, v_out, w_out, u, v, w)
+      real(dp) function scalar_product(self, x, y) result(s)
+         !! Calculates the scalar product of two input fields
+         import :: base_backend_t
+         import :: dp
+         import :: field_t
+         implicit none
+
+         class(base_backend_t) :: self
+         class(field_t), intent(in) :: x, y
+      end function scalar_product
+   end interface
+
+   abstract interface
+      subroutine get_field(self, arr, f)
          !! copy the specialist data structure from device or host back
          !! to a regular 3D data structure.
          import :: base_backend_t
@@ -135,13 +149,13 @@ subroutine get_fields(self, u_out, v_out, w_out, u, v, w)
          implicit none
 
          class(base_backend_t) :: self
-         real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out
-         class(field_t), intent(in) :: u, v, w
-      end subroutine get_fields
+         real(dp), dimension(:, :, :), intent(out) :: arr
+         class(field_t), intent(in) :: f
+      end subroutine get_field
 
-      subroutine set_fields(self, u, v, w, u_in, v_in, w_in)
+      subroutine set_field(self, f, arr)
          !! copy the initial condition stored in a regular 3D data
-         !! structure into the specialist data structure arrays on the
+         !! structure into the specialist data structure array on the
          !! device or host.
          import :: base_backend_t
          import :: dp
@@ -149,9 +163,9 @@ subroutine set_fields(self, u, v, w, u_in, v_in, w_in)
          implicit none
 
          class(base_backend_t) :: self
-         class(field_t), intent(inout) :: u, v, w
-         real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in
-      end subroutine set_fields
+         class(field_t), intent(inout) :: f
+         real(dp), dimension(:, :, :), intent(in) :: arr
+      end subroutine set_field
    end interface
 
    abstract interface
diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
index cd4acce4..1097d574 100644
--- a/src/cuda/backend.f90
+++ b/src/cuda/backend.f90
@@ -1,6 +1,7 @@
 module m_cuda_backend
    use iso_fortran_env, only: stderr => error_unit
    use cudafor
+   use mpi
 
    use m_allocator, only: allocator_t, field_t
    use m_base_backend, only: base_backend_t
@@ -16,7 +17,7 @@ module m_cuda_backend
    use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs
    use m_cuda_kernels_reorder, only: &
        reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2x, &
-       reorder_z2y, sum_yintox, sum_zintox, axpby, buffer_copy
+       reorder_z2y, sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy
 
    implicit none
 
@@ -41,8 +42,9 @@ module m_cuda_backend
       procedure :: sum_yintox => sum_yintox_cuda
       procedure :: sum_zintox => sum_zintox_cuda
       procedure :: vecadd => vecadd_cuda
-      procedure :: set_fields => set_fields_cuda
-      procedure :: get_fields => get_fields_cuda
+      procedure :: scalar_product => scalar_product_cuda
+      procedure :: set_field => set_field_cuda
+      procedure :: get_field => get_field_cuda
       procedure :: transeq_cuda_dist
       procedure :: transeq_cuda_thom
       procedure :: tds_solve_dist
@@ -529,6 +531,35 @@ subroutine vecadd_cuda(self, a, x, b, y)
 
    end subroutine vecadd_cuda
 
+   real(dp) function scalar_product_cuda(self, x, y) result(s)
+      implicit none
+
+      class(cuda_backend_t) :: self
+      class(field_t), intent(in) :: x, y
+
+      real(dp), device, pointer, dimension(:, :, :) :: x_d, y_d
+      real(dp), device, allocatable :: sum_d
+      type(dim3) :: blocks, threads
+      integer :: n, ierr
+
+      select type(x); type is (cuda_field_t); x_d => x%data_d; end select
+      select type(y); type is (cuda_field_t); y_d => y%data_d; end select
+
+      allocate (sum_d)
+      sum_d = 0._dp
+
+      n = size(x_d, dim = 2)
+      blocks = dim3(size(x_d, dim = 3), 1, 1)
+      threads = dim3(SZ, 1, 1)
+      call scalar_product<<<blocks, threads>>>(sum_d, x_d, y_d, n)
+
+      s = sum_d
+
+      call MPI_Allreduce(MPI_IN_PLACE, s, 1, MPI_DOUBLE_PRECISION, MPI_SUM, &
+                         MPI_COMM_WORLD, ierr)
+
+   end function scalar_product_cuda
+
    subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n)
       implicit none
 
@@ -547,31 +578,27 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n)
 
    end subroutine copy_into_buffers
 
-   subroutine set_fields_cuda(self, u, v, w, u_in, v_in, w_in)
+   subroutine set_field_cuda(self, f, arr)
       implicit none
 
       class(cuda_backend_t) :: self
-      class(field_t), intent(inout) :: u, v, w
-      real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in
+      class(field_t), intent(inout) :: f
+      real(dp), dimension(:, :, :), intent(in) :: arr
 
-      select type(u); type is (cuda_field_t); u%data_d = u_in; end select
-      select type(v); type is (cuda_field_t); v%data_d = v_in; end select
-      select type(w); type is (cuda_field_t); w%data_d = w_in; end select
+      select type(f); type is (cuda_field_t); f%data_d = arr; end select
 
-   end subroutine set_fields_cuda
+   end subroutine set_field_cuda
 
-   subroutine get_fields_cuda(self, u_out, v_out, w_out, u, v, w)
+   subroutine get_field_cuda(self, arr, f)
       implicit none
 
       class(cuda_backend_t) :: self
-      real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out
-      class(field_t), intent(in) :: u, v, w
+      real(dp), dimension(:, :, :), intent(out) :: arr
+      class(field_t), intent(in) :: f
 
-      select type(u); type is (cuda_field_t); u_out = u%data_d; end select
-      select type(v); type is (cuda_field_t); v_out = v%data_d; end select
-      select type(w); type is (cuda_field_t); w_out = w%data_d; end select
+      select type(f); type is (cuda_field_t); arr = f%data_d; end select
 
-   end subroutine get_fields_cuda
+   end subroutine get_field_cuda
 
 end module m_cuda_backend
 
diff --git a/src/cuda/kernels/reorder.f90 b/src/cuda/kernels/reorder.f90
index 52c093c5..11c74cac 100644
--- a/src/cuda/kernels/reorder.f90
+++ b/src/cuda/kernels/reorder.f90
@@ -199,6 +199,27 @@ attributes(global) subroutine axpby(n, alpha, x, beta, y)
 
    end subroutine axpby
 
+   attributes(global) subroutine scalar_product(s, x, y, n)
+      implicit none
+
+      real(dp), device, intent(inout) :: s
+      real(dp), device, intent(in), dimension(:, :, :) :: x, y
+      integer, value, intent(in) :: n
+
+      real(dp) :: s_pncl !! pencil sum
+      integer :: i, j, b, ierr
+
+      i = threadIdx%x
+      b = blockIdx%x
+
+      s_pncl = 0._dp
+      do j = 1, n
+         s_pncl = s_pncl + x(i, j, b)*y(i, j, b)
+      end do
+      ierr = atomicadd(s, s_pncl)
+
+   end subroutine scalar_product
+
    attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo)
       implicit none
 
diff --git a/src/omp/backend.f90 b/src/omp/backend.f90
index dc6f7a1f..aca287bd 100644
--- a/src/omp/backend.f90
+++ b/src/omp/backend.f90
@@ -30,8 +30,9 @@ module m_omp_backend
       procedure :: sum_yintox => sum_yintox_omp
       procedure :: sum_zintox => sum_zintox_omp
       procedure :: vecadd => vecadd_omp
-      procedure :: set_fields => set_fields_omp
-      procedure :: get_fields => get_fields_omp
+      procedure :: scalar_product => scalar_product_omp
+      procedure :: set_field => set_field_omp
+      procedure :: get_field => get_field_omp
       procedure :: transeq_omp_dist
    end type omp_backend_t
 
@@ -304,6 +305,16 @@ subroutine vecadd_omp(self, a, x, b, y)
 
    end subroutine vecadd_omp
 
+   real(dp) function scalar_product_omp(self, x, y) result(s)
+      implicit none
+
+      class(omp_backend_t) :: self
+      class(field_t), intent(in) :: x, y
+
+      s = 0._dp
+
+   end function scalar_product_omp
+
    subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks)
       implicit none
 
@@ -329,31 +340,27 @@ subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks)
 
    end subroutine copy_into_buffers
 
-   subroutine set_fields_omp(self, u, v, w, u_in, v_in, w_in)
+   subroutine set_field_omp(self, f, arr)
       implicit none
 
       class(omp_backend_t) :: self
-      class(field_t), intent(inout) :: u, v, w
-      real(dp), dimension(:, :, :), intent(in) :: u_in, v_in, w_in
+      class(field_t), intent(inout) :: f
+      real(dp), dimension(:, :, :), intent(in) :: arr
 
-      u%data = u_in
-      v%data = v_in
-      w%data = w_in
+      f%data = arr
 
-   end subroutine set_fields_omp
+   end subroutine set_field_omp
 
-   subroutine get_fields_omp(self, u_out, v_out, w_out, u, v, w)
+   subroutine get_field_omp(self, arr, f)
       implicit none
 
       class(omp_backend_t) :: self
-      real(dp), dimension(:, :, :), intent(out) :: u_out, v_out, w_out
-      class(field_t), intent(in) :: u, v, w
+      real(dp), dimension(:, :, :), intent(out) :: arr
+      class(field_t), intent(in) :: f
 
-      u_out = u%data
-      v_out = v%data
-      w_out = w%data
+      arr = f%data
 
-   end subroutine get_fields_omp
+   end subroutine get_field_omp
 
 end module m_omp_backend
 
diff --git a/src/solver.f90 b/src/solver.f90
index 60eead1c..fad0a715 100644
--- a/src/solver.f90
+++ b/src/solver.f90
@@ -93,9 +93,9 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) &
       v_init = 0
       w_init = 0
 
-      call solver%backend%set_fields( &
-         solver%u, solver%v, solver%w, u_init, v_init, w_init &
-      )
+      call solver%backend%set_field(solver%u, u_init)
+      call solver%backend%set_field(solver%v, v_init)
+      call solver%backend%set_field(solver%w, w_init)
 
       deallocate(u_init, v_init, w_init)
       print*, 'initial conditions are set'
@@ -548,9 +548,9 @@ subroutine run(self, n_iter, u_out, v_out, w_out)
 
       print*, 'run end'
 
-      call self%backend%get_fields( &
-         u_out, v_out, w_out, self%u, self%v, self%w &
-      )
+      call self%backend%get_field(u_out, self%u)
+      call self%backend%get_field(v_out, self%v)
+      call self%backend%get_field(w_out, self%w)
 
    end subroutine run