From f469f7171413947d72786eadd5cf79e6803f595b Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Thu, 14 Mar 2024 14:52:27 +0000
Subject: [PATCH 01/11] Add c2x and x2c reorder kernels in CUDA backend.

---
 src/cuda/kernels/reorder.f90 | 46 ++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/src/cuda/kernels/reorder.f90 b/src/cuda/kernels/reorder.f90
index 11c74cac..52ccd0e5 100644
--- a/src/cuda/kernels/reorder.f90
+++ b/src/cuda/kernels/reorder.f90
@@ -6,6 +6,52 @@ module m_cuda_kernels_reorder
 
 contains
 
+   attributes(global) subroutine reorder_c2x(u_x, u_c, nz)
+      implicit none
+
+      real(dp), device, intent(out), dimension(:, :, :) :: u_x
+      real(dp), device, intent(in), dimension(:, :, :) :: u_c
+      integer, value, intent(in) :: nz
+
+      real(dp), shared :: tile(SZ, SZ)
+      integer :: i, j, b_i, b_j, b_k
+
+      i = threadIdx%x; j = threadIdx%y
+      b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
+
+      ! copy into shared
+      tile(i, j) = u_c(i + (b_i - 1)*SZ, j + (b_j - 1)*SZ, b_k)
+
+      call syncthreads()
+
+      ! copy into output array from shared
+      u_x(i, j + (b_i - 1)*SZ, b_k + (b_j - 1)*nz) = tile(j, i)
+
+   end subroutine reorder_c2x
+
+   attributes(global) subroutine reorder_x2c(u_c, u_x, nz)
+      implicit none
+
+      real(dp), device, intent(out), dimension(:, :, :) :: u_c
+      real(dp), device, intent(in), dimension(:, :, :) :: u_x
+      integer, value, intent(in) :: nz
+
+      real(dp), shared :: tile(SZ, SZ)
+      integer :: i, j, b_i, b_j, b_k
+
+      i = threadIdx%x; j = threadIdx%y
+      b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z
+
+      ! copy into shared
+      tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_k + (b_j - 1)*nz)
+
+      call syncthreads()
+
+      ! copy into output array from shared
+      u_c(i + (b_i - 1)*SZ, j + (b_j - 1)*SZ, b_k) = tile(j, i)
+
+   end subroutine reorder_x2c
+
    attributes(global) subroutine reorder_x2y(u_y, u_x, nz)
       implicit none
 

From 2dfd0d99ce7e51161a60e8b8bec3e5352cf17212 Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Thu, 14 Mar 2024 14:53:19 +0000
Subject: [PATCH 02/11] Add a test for c2x and x2c reorder kernels.

---
 tests/cuda/test_cuda_reorder.f90 | 35 +++++++++++++++++++++++++++++---
 1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/tests/cuda/test_cuda_reorder.f90 b/tests/cuda/test_cuda_reorder.f90
index bacd3450..2c8f6800 100644
--- a/tests/cuda/test_cuda_reorder.f90
+++ b/tests/cuda/test_cuda_reorder.f90
@@ -6,13 +6,15 @@ program test_cuda_reorder
    use m_common, only: dp
    use m_cuda_common, only: SZ
    use m_cuda_kernels_reorder, only: reorder_x2y, reorder_x2z, reorder_y2x, &
-                                     reorder_y2z, reorder_z2x, reorder_z2y
+                                     reorder_y2z, reorder_z2x, reorder_z2y, &
+                                     reorder_c2x, reorder_x2c
 
    implicit none
 
    logical :: allpass = .true.
-   real(dp), allocatable, dimension(:, :, :) :: u_i, u_o, u_temp
-   real(dp), device, allocatable, dimension(:, :, :) :: u_i_d, u_o_d, u_temp_d
+   real(dp), allocatable, dimension(:, :, :) :: u_i, u_o, u_temp, u_c
+   real(dp), device, allocatable, dimension(:, :, :) :: u_i_d, u_o_d, &
+                                                        u_temp_d, u_c_d
 
    integer :: n_block, i, n_iters
    integer :: nx, ny, nz, ndof
@@ -46,6 +48,10 @@ program test_cuda_reorder
    allocate (u_i_d(SZ, nx, n_block), u_o_d(SZ, nx, n_block))
    allocate (u_temp_d(SZ, nx, n_block))
 
+   ! Cartesian order storage
+   allocate (u_c_d(nx, ny, nz))
+   allocate (u_c(nx, ny, nz))
+
    ! set a random field
    call random_number(u_i)
 
@@ -126,6 +132,29 @@ program test_cuda_reorder
       end if
    end if
 
+   ! x ordering into Cartesian ordering
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
+   call reorder_x2c<<<blocks, threads>>>(u_c_d, u_i_d, nz)
+
+   ! sanitise u_o_d
+   u_o_d = 0
+
+   ! Cartesian ordering back to x ordering
+   call reorder_c2x<<<blocks, threads>>>(u_o_d, u_c_d, nz)
+   u_o = u_o_d
+
+   ! now both u_o and u_i in x ordering, compare them
+   norm_u = norm2(u_o - u_i)
+   if (nrank == 0) then
+      if ( norm_u > tol ) then
+         allpass = .false.
+         write(stderr, '(a)') 'Check reorder x2c and c2x... failed'
+      else
+         write(stderr, '(a)') 'Check reorder x2c and c2x... passed'
+      end if
+   end if
+
    ! Now the performance checks
    call cpu_time(tstart)
    do i = 1, n_iters

From 0fa4f78fd97d2105136621e0649349fd3f731f58 Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Thu, 14 Mar 2024 14:57:01 +0000
Subject: [PATCH 03/11] Add performance checks for the c2x/x2c reorder kernels
 in the tests.

---
 tests/cuda/test_cuda_reorder.f90 | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tests/cuda/test_cuda_reorder.f90 b/tests/cuda/test_cuda_reorder.f90
index 2c8f6800..595d57fb 100644
--- a/tests/cuda/test_cuda_reorder.f90
+++ b/tests/cuda/test_cuda_reorder.f90
@@ -216,6 +216,24 @@ program test_cuda_reorder
 
    call checkperf(tend - tstart, n_iters, ndof, 2._dp)
 
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
+   call cpu_time(tstart)
+   do i = 1, n_iters
+      call reorder_x2c<<<blocks, threads>>>(u_c_d, u_i_d, nz)
+   end do
+   call cpu_time(tend)
+
+   call checkperf(tend - tstart, n_iters, ndof, 2._dp)
+
+   call cpu_time(tstart)
+   do i = 1, n_iters
+      call reorder_c2x<<<blocks, threads>>>(u_o_d, u_c_d, nz)
+   end do
+   call cpu_time(tend)
+
+   call checkperf(tend - tstart, n_iters, ndof, 2._dp)
+
    if (allpass) then
       if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.'
    else

From 39c79dd521499b01a1bce2adcc7ae9f96709c853 Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Thu, 14 Mar 2024 15:22:06 +0000
Subject: [PATCH 04/11] Remove MPI stuff from reorder tests.

---
 tests/cuda/test_cuda_reorder.f90 | 97 ++++++++++----------------------
 1 file changed, 31 insertions(+), 66 deletions(-)

diff --git a/tests/cuda/test_cuda_reorder.f90 b/tests/cuda/test_cuda_reorder.f90
index 595d57fb..f948c35f 100644
--- a/tests/cuda/test_cuda_reorder.f90
+++ b/tests/cuda/test_cuda_reorder.f90
@@ -1,7 +1,6 @@
 program test_cuda_reorder
    use iso_fortran_env, only: stderr => error_unit
    use cudafor
-   use mpi
 
    use m_common, only: dp
    use m_cuda_common, only: SZ
@@ -18,25 +17,10 @@ program test_cuda_reorder
 
    integer :: n_block, i, n_iters
    integer :: nx, ny, nz, ndof
-   integer :: nrank, nproc, pprev, pnext
-   integer :: ierr, ndevs, devnum
 
    type(dim3) :: blocks, threads
    real(dp) :: norm_u, tol = 1d-8, tstart, tend
 
-   call MPI_Init(ierr)
-   call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr)
-   call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr)
-
-   if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks'
-
-   ierr = cudaGetDeviceCount(ndevs)
-   ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin
-   ierr = cudaGetDevice(devnum)
-
-   !print*, 'I am rank', nrank, 'I am running on device', devnum
-   pnext = modulo(nrank - nproc + 1, nproc)
-   pprev = modulo(nrank - 1, nproc)
 
    nx = 512; ny = 512; nz = 512
    n_block = ny*nz/SZ
@@ -72,13 +56,11 @@ program test_cuda_reorder
 
    ! and check whether it matches the initial random field
    norm_u = norm2(u_o - u_i)
-   if (nrank == 0) then
-      if ( norm_u > tol ) then
-         allpass = .false.
-         write(stderr, '(a)') 'Check reorder x2y and y2x... failed'
-      else
-         write(stderr, '(a)') 'Check reorder x2y and y2x... passed'
-      end if
+   if ( norm_u > tol ) then
+      allpass = .false.
+      write(stderr, '(a)') 'Check reorder x2y and y2x... failed'
+   else
+      write(stderr, '(a)') 'Check reorder x2y and y2x... passed'
    end if
 
    ! we reuse u_o_d so zeroize in any case
@@ -101,13 +83,11 @@ program test_cuda_reorder
 
    ! compare two y oriented fields
    norm_u = norm2(u_o - u_temp)
-   if (nrank == 0) then
-      if ( norm_u > tol ) then
-         allpass = .false.
-         write(stderr, '(a)') 'Check reorder y2z and y2z... failed'
-      else
-         write(stderr, '(a)') 'Check reorder y2z and y2z... passed'
-      end if
+   if ( norm_u > tol ) then
+      allpass = .false.
+      write(stderr, '(a)') 'Check reorder y2z and y2z... failed'
+   else
+      write(stderr, '(a)') 'Check reorder y2z and y2z... passed'
    end if
 
    ! reorder initial random field into z orientation
@@ -123,13 +103,11 @@ program test_cuda_reorder
 
    ! compare two z oriented fields
    norm_u = norm2(u_o - u_i)
-   if (nrank == 0) then
-      if ( norm_u > tol ) then
-         allpass = .false.
-         write(stderr, '(a)') 'Check reorder x2z and z2x... failed'
-      else
-         write(stderr, '(a)') 'Check reorder x2z and z2x... passed'
-      end if
+   if ( norm_u > tol ) then
+      allpass = .false.
+      write(stderr, '(a)') 'Check reorder x2z and z2x... failed'
+   else
+      write(stderr, '(a)') 'Check reorder x2z and z2x... passed'
    end if
 
    ! x ordering into Cartesian ordering
@@ -146,13 +124,17 @@ program test_cuda_reorder
 
    ! now both u_o and u_i in x ordering, compare them
    norm_u = norm2(u_o - u_i)
-   if (nrank == 0) then
-      if ( norm_u > tol ) then
-         allpass = .false.
-         write(stderr, '(a)') 'Check reorder x2c and c2x... failed'
-      else
-         write(stderr, '(a)') 'Check reorder x2c and c2x... passed'
-      end if
+   if ( norm_u > tol ) then
+      allpass = .false.
+      write(stderr, '(a)') 'Check reorder x2c and c2x... failed'
+   else
+      write(stderr, '(a)') 'Check reorder x2c and c2x... passed'
+   end if
+
+   if (allpass) then
+      write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.'
+   else
+      error stop 'SOME TESTS FAILED.'
    end if
 
    ! Now the performance checks
@@ -234,14 +216,6 @@ program test_cuda_reorder
 
    call checkperf(tend - tstart, n_iters, ndof, 2._dp)
 
-   if (allpass) then
-      if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.'
-   else
-      error stop 'SOME TESTS FAILED.'
-   end if
-
-   call MPI_Finalize(ierr)
-
 contains
 
    subroutine checkperf(t_tot, n_iters, ndof, consumed_bw)
@@ -250,31 +224,22 @@ subroutine checkperf(t_tot, n_iters, ndof, consumed_bw)
       real(dp), intent(in) :: t_tot, consumed_bw
       integer, intent(in) :: n_iters, ndof
 
-      real(dp) :: achievedBW, devBW, achievedBWmax, achievedBWmin
+      real(dp) :: achievedBW, devBW
       integer :: ierr, memClockRt, memBusWidth
 
       ! BW utilisation and performance checks
       achievedBW = consumed_bw*n_iters*ndof*dp/t_tot
-      call MPI_Allreduce(achievedBW, achievedBWmax, 1, MPI_DOUBLE_PRECISION, &
-                         MPI_MAX, MPI_COMM_WORLD, ierr)
-      call MPI_Allreduce(achievedBW, achievedBWmin, 1, MPI_DOUBLE_PRECISION, &
-                         MPI_MIN, MPI_COMM_WORLD, ierr)
 
-      if (nrank == 0) then
-         print'(a, f8.3, a)', 'Achieved BW min: ', achievedBWmin/2**30, ' GiB/s'
-         print'(a, f8.3, a)', 'Achieved BW max: ', achievedBWmax/2**30, ' GiB/s'
-      end if
+      print'(a, f8.3, a)', 'Achieved BW: ', achievedBW/2**30, ' GiB/s'
 
       ierr = cudaDeviceGetAttribute(memClockRt, cudaDevAttrMemoryClockRate, 0)
       ierr = cudaDeviceGetAttribute(memBusWidth, &
                                     cudaDevAttrGlobalMemoryBusWidth, 0)
       devBW = 2*memBusWidth/8._dp*memClockRt*1000
 
-      if (nrank == 0) then
-         print'(a, f8.3, a)', 'Device BW:   ', devBW/2**30, ' GiB/s'
-         print'(a, f5.2)', 'Effective BW util min: %', achievedBWmin/devBW*100
-         print'(a, f5.2)', 'Effective BW util max: %', achievedBWmax/devBW*100
-      end if
+      print'(a, f8.3, a)', 'Device BW:   ', devBW/2**30, ' GiB/s'
+      print'(a, f5.2)', 'Effective BW util: %', achievedBW/devBW*100
+
    end subroutine checkperf
 
 end program test_cuda_reorder

From 1353cd003576db64a38f1f29ebf5f55d40fe19bb Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Thu, 14 Mar 2024 15:30:33 +0000
Subject: [PATCH 05/11] Cleanup performance tests.

---
 tests/cuda/test_cuda_reorder.f90 | 35 +++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/tests/cuda/test_cuda_reorder.f90 b/tests/cuda/test_cuda_reorder.f90
index f948c35f..d3bbf97c 100644
--- a/tests/cuda/test_cuda_reorder.f90
+++ b/tests/cuda/test_cuda_reorder.f90
@@ -138,66 +138,74 @@ program test_cuda_reorder
    end if
 
    ! Now the performance checks
+
+   print*, 'Performance test: reorder_x2y'
+   blocks = dim3(nx/SZ, nz, ny/SZ)
+   threads = dim3(SZ, SZ, 1)
    call cpu_time(tstart)
    do i = 1, n_iters
-      blocks = dim3(nx/SZ, nz, ny/SZ)
-      threads = dim3(SZ, SZ, 1)
       call reorder_x2y<<<blocks, threads>>>(u_o_d, u_i_d, nz)
    end do
    call cpu_time(tend)
 
    call checkperf(tend - tstart, n_iters, ndof, 2._dp)
 
+   print*, 'Performance test: reorder_x2z'
+   blocks = dim3(nx, ny/SZ, 1)
+   threads = dim3(SZ, 1, 1)
    call cpu_time(tstart)
    do i = 1, n_iters
-      blocks = dim3(nx, ny/SZ, 1)
-      threads = dim3(SZ, 1, 1)
       call reorder_x2z<<<blocks, threads>>>(u_o_d, u_i_d, nz)
    end do
    call cpu_time(tend)
 
    call checkperf(tend - tstart, n_iters, ndof, 2._dp)
 
+   print*, 'Performance test: reorder_y2x'
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
    call cpu_time(tstart)
    do i = 1, n_iters
-      blocks = dim3(nx/SZ, ny/SZ, nz)
-      threads = dim3(SZ, SZ, 1)
       call reorder_y2x<<<blocks, threads>>>(u_o_d, u_i_d, nz)
    end do
    call cpu_time(tend)
 
    call checkperf(tend - tstart, n_iters, ndof, 2._dp)
 
+   print*, 'Performance test: reorder_y2z'
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
    call cpu_time(tstart)
    do i = 1, n_iters
-      blocks = dim3(nx/SZ, ny/SZ, nz)
-      threads = dim3(SZ, SZ, 1)
       call reorder_y2z<<<blocks, threads>>>(u_o_d, u_i_d, nx, nz)
    end do
    call cpu_time(tend)
 
    call checkperf(tend - tstart, n_iters, ndof, 2._dp)
 
+   print*, 'Performance test: reorder_z2x'
+   blocks = dim3(nx, ny/SZ, 1)
+   threads = dim3(SZ, 1, 1)
    call cpu_time(tstart)
    do i = 1, n_iters
-      blocks = dim3(nx, ny/SZ, 1)
-      threads = dim3(SZ, 1, 1)
       call reorder_z2x<<<blocks, threads>>>(u_o_d, u_i_d, nz)
    end do
    call cpu_time(tend)
 
    call checkperf(tend - tstart, n_iters, ndof, 2._dp)
 
+   print*, 'Performance test: reorder_z2y'
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
    call cpu_time(tstart)
    do i = 1, n_iters
-      blocks = dim3(nx/SZ, ny/SZ, nz)
-      threads = dim3(SZ, SZ, 1)
       call reorder_z2y<<<blocks, threads>>>(u_o_d, u_i_d, nx, nz)
    end do
    call cpu_time(tend)
 
    call checkperf(tend - tstart, n_iters, ndof, 2._dp)
 
+   print*, 'Performance test: reorder_x2c'
    blocks = dim3(nx/SZ, ny/SZ, nz)
    threads = dim3(SZ, SZ, 1)
    call cpu_time(tstart)
@@ -208,6 +216,9 @@ program test_cuda_reorder
 
    call checkperf(tend - tstart, n_iters, ndof, 2._dp)
 
+   print*, 'Performance test: reorder_c2x'
+   blocks = dim3(nx/SZ, ny/SZ, nz)
+   threads = dim3(SZ, SZ, 1)
    call cpu_time(tstart)
    do i = 1, n_iters
       call reorder_c2x<<<blocks, threads>>>(u_o_d, u_c_d, nz)

From 1cc36e5c544026c1f77f680ba93bea05c8301446 Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Thu, 14 Mar 2024 16:04:39 +0000
Subject: [PATCH 06/11] Make c2x/x2c reorder kernels available in CUDA backend.

---
 src/common.f90       |  3 ++-
 src/cuda/backend.f90 | 11 ++++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/common.f90 b/src/common.f90
index 7830ac09..6232bcdd 100644
--- a/src/common.f90
+++ b/src/common.f90
@@ -5,7 +5,8 @@ module m_common
    real(dp), parameter :: pi = 4*atan(1.0_dp)
 
    integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, &
-                         RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32
+                         RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32, &
+                         RDR_C2X = 41, RDR_X2C = 14
    integer, parameter :: DIR_X = 1, DIR_Y = 2, DIR_Z = 3, DIR_C = 4
    integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1
 
diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
index 9112c62f..6d10123d 100644
--- a/src/cuda/backend.f90
+++ b/src/cuda/backend.f90
@@ -6,7 +6,8 @@ module m_cuda_backend
    use m_allocator, only: allocator_t, field_t
    use m_base_backend, only: base_backend_t
    use m_common, only: dp, globs_t, &
-                       RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y
+                       RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, &
+                       RDR_C2X, RDR_X2C
    use m_poisson_fft, only: poisson_fft_t
    use m_tdsops, only: dirps_t, tdsops_t
 
@@ -450,6 +451,14 @@ subroutine reorder_cuda(self, u_o, u_i, direction)
          threads = dim3(SZ, SZ, 1)
          call reorder_z2y<<<blocks, threads>>>(u_o_d, u_i_d, &
                                                self%nx_loc, self%nz_loc)
+      case (RDR_C2X) ! c2x
+         blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
+         threads = dim3(SZ, SZ, 1)
+         call reorder_c2x<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
+      case (RDR_X2C) ! x2c
+         blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
+         threads = dim3(SZ, SZ, 1)
+         call reorder_x2c<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
       case default
          error stop 'Reorder direction is undefined.'
       end select

From 3b376e27565abec8e306c240a17b8bc742543a54 Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Wed, 20 Mar 2024 13:30:44 +0000
Subject: [PATCH 07/11] Add support for reorders for Y2C, Z2C, C2Y, C2Z in CUDA
 backend.

---
 src/common.f90       |  3 +-
 src/cuda/backend.f90 | 79 ++++++++++++++++++++++++++++++++++++++------
 2 files changed, 70 insertions(+), 12 deletions(-)

diff --git a/src/common.f90 b/src/common.f90
index 6232bcdd..181781d7 100644
--- a/src/common.f90
+++ b/src/common.f90
@@ -6,7 +6,8 @@ module m_common
 
    integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, &
                          RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32, &
-                         RDR_C2X = 41, RDR_X2C = 14
+                         RDR_C2X = 41, RDR_C2Y = 42, RDR_C2Z = 43, &
+                         RDR_X2C = 14, RDR_Y2C = 24, RDR_Z2C = 34
    integer, parameter :: DIR_X = 1, DIR_Y = 2, DIR_Z = 3, DIR_C = 4
    integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1
 
diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
index 6d10123d..5acadf6f 100644
--- a/src/cuda/backend.f90
+++ b/src/cuda/backend.f90
@@ -7,7 +7,8 @@ module m_cuda_backend
    use m_base_backend, only: base_backend_t
    use m_common, only: dp, globs_t, &
                        RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, &
-                       RDR_C2X, RDR_X2C
+                       RDR_C2X, RDR_C2Y, RDR_C2Z, RDR_X2C, RDR_Y2C, RDR_Z2C, &
+                       DIR_X, DIR_Y, DIR_Z, DIR_C
    use m_poisson_fft, only: poisson_fft_t
    use m_tdsops, only: dirps_t, tdsops_t
 
@@ -20,7 +21,8 @@ module m_cuda_backend
    use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs
    use m_cuda_kernels_reorder, only: &
        reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2x, &
-       reorder_z2y, sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy
+       reorder_z2y, reorder_c2x, reorder_x2c, &
+       sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy
 
    implicit none
 
@@ -418,47 +420,102 @@ subroutine reorder_cuda(self, u_o, u_i, direction)
       class(field_t), intent(in) :: u_i
       integer, intent(in) :: direction
 
-      real(dp), device, pointer, dimension(:, :, :) :: u_o_d, u_i_d
+      real(dp), device, pointer, dimension(:, :, :) :: u_o_d, u_i_d, u_temp_d
+      class(field_t), pointer :: u_temp
       type(dim3) :: blocks, threads
 
       call resolve_field_t(u_o_d, u_o)
       call resolve_field_t(u_i_d, u_i)
 
       select case (direction)
-      case (RDR_X2Y) ! x2y
+      case (RDR_X2Y)
          blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ)
          threads = dim3(SZ, SZ, 1)
          call reorder_x2y<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
-      case (RDR_X2Z) ! x2z
+      case (RDR_X2Z)
          blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
          threads = dim3(SZ, 1, 1)
          call reorder_x2z<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
-      case (RDR_Y2X) ! y2x
+      case (RDR_Y2X)
          blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
          threads = dim3(SZ, SZ, 1)
          call reorder_y2x<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
-      case (RDR_Y2Z) ! y2z
+      case (RDR_Y2Z)
          blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
          threads = dim3(SZ, SZ, 1)
          call reorder_y2z<<<blocks, threads>>>(u_o_d, u_i_d, &
                                                self%nx_loc, self%nz_loc)
-      case (RDR_Z2X) ! z2x
+      case (RDR_Z2X)
          blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
          threads = dim3(SZ, 1, 1)
          call reorder_z2x<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
-      case (RDR_Z2Y) ! z2y
+      case (RDR_Z2Y)
          blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
          threads = dim3(SZ, SZ, 1)
          call reorder_z2y<<<blocks, threads>>>(u_o_d, u_i_d, &
                                                self%nx_loc, self%nz_loc)
-      case (RDR_C2X) ! c2x
+      case (RDR_C2X)
          blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
          threads = dim3(SZ, SZ, 1)
          call reorder_c2x<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
-      case (RDR_X2C) ! x2c
+      case (RDR_C2Y)
+         ! First reorder from C to X, then from X to Y
+         u_temp => self%allocator%get_block(DIR_X)
+         call resolve_field_t(u_temp_d, u_temp)
+
+         blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
+         threads = dim3(SZ, SZ, 1)
+         call reorder_c2x<<<blocks, threads>>>(u_temp_d, u_i_d, self%nz_loc)
+
+         blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ)
+         threads = dim3(SZ, SZ, 1)
+         call reorder_x2y<<<blocks, threads>>>(u_o_d, u_temp_d, self%nz_loc)
+
+         call self%allocator%release_block(u_temp)
+      case (RDR_C2Z)
+         ! First reorder from C to X, then from X to Z
+         u_temp => self%allocator%get_block(DIR_X)
+         call resolve_field_t(u_temp_d, u_temp)
+
+         blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
+         threads = dim3(SZ, SZ, 1)
+         call reorder_c2x<<<blocks, threads>>>(u_temp_d, u_i_d, self%nz_loc)
+
+         blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
+         threads = dim3(SZ, 1, 1)
+         call reorder_x2z<<<blocks, threads>>>(u_o_d, u_temp_d, self%nz_loc)
+
+         call self%allocator%release_block(u_temp)
+      case (RDR_X2C)
          blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
          threads = dim3(SZ, SZ, 1)
          call reorder_x2c<<<blocks, threads>>>(u_o_d, u_i_d, self%nz_loc)
+      case (RDR_Y2C)
+         ! First reorder from Y to X, then from X to C
+         u_temp => self%allocator%get_block(DIR_X)
+         call resolve_field_t(u_temp_d, u_temp)
+
+         blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
+         threads = dim3(SZ, SZ, 1)
+         call reorder_y2x<<<blocks, threads>>>(u_temp_d, u_i_d, self%nz_loc)
+
+         call reorder_x2c<<<blocks, threads>>>(u_o_d, u_temp_d, self%nz_loc)
+
+         call self%allocator%release_block(u_temp)
+      case (RDR_Z2C)
+         ! First reorder from Z to X, then from X to C
+         u_temp => self%allocator%get_block(DIR_X)
+         call resolve_field_t(u_temp_d, u_temp)
+
+         blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1)
+         threads = dim3(SZ, 1, 1)
+         call reorder_z2x<<<blocks, threads>>>(u_temp_d, u_i_d, self%nz_loc)
+
+         blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc)
+         threads = dim3(SZ, SZ, 1)
+         call reorder_x2c<<<blocks, threads>>>(u_o_d, u_temp_d, self%nz_loc)
+
+         call self%allocator%release_block(u_temp)
       case default
          error stop 'Reorder direction is undefined.'
       end select

From 8128e80ffea281831907d18e88ebf4c42caa977e Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Wed, 20 Mar 2024 18:11:23 +0000
Subject: [PATCH 08/11] Improve get field and set field operations.

---
 src/backend.f90      | 103 +++++++++++++++++++++++++++++++++++--------
 src/common.f90       |  13 ++++++
 src/cuda/backend.f90 |  30 +++++--------
 src/omp/backend.f90  |  30 +++++--------
 src/solver.f90       |  14 +++---
 5 files changed, 129 insertions(+), 61 deletions(-)

diff --git a/src/backend.f90 b/src/backend.f90
index e17970e5..73bfd01b 100644
--- a/src/backend.f90
+++ b/src/backend.f90
@@ -1,6 +1,6 @@
 module m_base_backend
    use m_allocator, only: allocator_t, field_t
-   use m_common, only: dp
+   use m_common, only: dp, DIR_C, get_rdr_from_dirs
    use m_poisson_fft, only: poisson_fft_t
    use m_tdsops, only: tdsops_t, dirps_t
 
@@ -36,10 +36,12 @@ module m_base_backend
       procedure(sum_intox), deferred :: sum_zintox
       procedure(vecadd), deferred :: vecadd
       procedure(scalar_product), deferred :: scalar_product
-      procedure(get_field), deferred :: get_field
-      procedure(set_field), deferred :: set_field
+      procedure(copy_data_to_f), deferred :: copy_data_to_f
+      procedure(copy_f_to_data), deferred :: copy_f_to_data
       procedure(alloc_tdsops), deferred :: alloc_tdsops
       procedure(init_poisson_fft), deferred :: init_poisson_fft
+      procedure :: get_field_data
+      procedure :: set_field_data
    end type base_backend_t
 
    abstract interface
@@ -143,32 +145,31 @@ end function scalar_product
    end interface
 
    abstract interface
-      subroutine get_field(self, arr, f)
-         !! copy the specialist data structure from device or host back
-         !! to a regular 3D data structure.
+      subroutine copy_data_to_f(self, f, data)
+         !! Copy the specialist data structure from device or host back
+         !! to a regular 3D data array in host memory.
          import :: base_backend_t
          import :: dp
          import :: field_t
          implicit none
 
-         class(base_backend_t) :: self
-         real(dp), dimension(:, :, :), intent(out) :: arr
-         class(field_t), intent(in) :: f
-      end subroutine get_field
+         class(base_backend_t), intent(inout) :: self
+         class(field_t), intent(inout) :: f
+         real(dp), dimension(:, :, :), intent(in) :: data
+      end subroutine copy_data_to_f
 
-      subroutine set_field(self, f, arr)
-         !! copy the initial condition stored in a regular 3D data
-         !! structure into the specialist data structure array on the
-         !! device or host.
+      subroutine copy_f_to_data(self, data, f)
+         !! Copy a regular 3D array in host memory into the specialist
+         !! data structure field that lives on device or host
          import :: base_backend_t
          import :: dp
          import :: field_t
          implicit none
 
-         class(base_backend_t) :: self
-         class(field_t), intent(inout) :: f
-         real(dp), dimension(:, :, :), intent(in) :: arr
-      end subroutine set_field
+         class(base_backend_t), intent(inout) :: self
+         real(dp), dimension(:, :, :), intent(out) :: data
+         class(field_t), intent(in) :: f
+      end subroutine copy_f_to_data
    end interface
 
    abstract interface
@@ -202,4 +203,70 @@ subroutine init_poisson_fft(self, xdirps, ydirps, zdirps)
       end subroutine init_poisson_fft
    end interface
 
+contains
+
+   subroutine get_field_data(self, data, f, dir)
+      implicit none
+
+      class(base_backend_t) :: self
+      real(dp), dimension(:, :, :), intent(out) :: data
+      class(field_t), intent(in) :: f
+      integer, optional, intent(in) :: dir
+
+      class(field_t), pointer :: f_temp
+      integer :: direction, rdr_dir
+
+      if (present(dir)) then
+         direction = dir
+      else
+         direction = DIR_C
+      end if
+
+      ! Returns 0 if no reorder required
+      rdr_dir = get_rdr_from_dirs(direction, f%dir)
+
+      ! Carry out a reorder if we need, and copy from field to data array
+      if (rdr_dir /= 0) then
+         f_temp => self%allocator%get_block(direction)
+         call self%reorder(f_temp, f, rdr_dir)
+         call self%copy_f_to_data(data, f_temp)
+         call self%allocator%release_block(f_temp)
+      else
+         call self%copy_f_to_data(data, f)
+      end if
+
+   end subroutine get_field_data
+
+   subroutine set_field_data(self, f, data, dir)
+      implicit none
+
+      class(base_backend_t) :: self
+      class(field_t), intent(inout) :: f
+      real(dp), dimension(:, :, :), intent(in) :: data
+      integer, optional, intent(in) :: dir
+
+      class(field_t), pointer :: f_temp
+      integer :: direction, rdr_dir
+
+      if (present(dir)) then
+         direction = dir
+      else
+         direction = DIR_C
+      end if
+
+      ! Returns 0 if no reorder required
+      rdr_dir = get_rdr_from_dirs(f%dir, direction)
+
+      ! Carry out a reorder if we need, and copy from data array to field
+      if (rdr_dir /= 0) then
+         f_temp => self%allocator%get_block(direction)
+         call self%copy_data_to_f(f_temp, data)
+         call self%reorder(f, f_temp, rdr_dir)
+         call self%allocator%release_block(f_temp)
+      else
+         call self%copy_data_to_f(f, data)
+      end if
+
+   end subroutine set_field_data
+
 end module m_base_backend
diff --git a/src/common.f90 b/src/common.f90
index 181781d7..a2ceab27 100644
--- a/src/common.f90
+++ b/src/common.f90
@@ -11,6 +11,12 @@ module m_common
    integer, parameter :: DIR_X = 1, DIR_Y = 2, DIR_Z = 3, DIR_C = 4
    integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1
 
+   integer, protected :: &
+      rdr_map(4, 4) = reshape([0, RDR_X2Y, RDR_X2Z, RDR_X2C, &
+                               RDR_Y2X, 0, RDR_Y2Z, RDR_Y2C, &
+                               RDR_Z2X, RDR_Z2Y, 0, RDR_Z2C, &
+                               RDR_C2X, RDR_C2Y, RDR_C2Z, 0], shape=[4, 4])
+
    type :: globs_t
       integer :: nx, ny, nz
       integer :: nx_loc, ny_loc, nz_loc
@@ -81,4 +87,11 @@ subroutine set_pprev_pnext(xprev, xnext, yprev, ynext, zprev, znext, &
 
    end subroutine set_pprev_pnext
 
+   integer function get_rdr_from_dirs(dir_from, dir_to) result(rdr_dir)
+      !! Returns RDR_?2? value based on two direction inputs
+      integer, intent(in) :: dir_from, dir_to
+
+      rdr_dir = rdr_map(dir_from, dir_to)
+   end function get_rdr_from_dirs
+
 end module m_common
diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90
index 5acadf6f..ff7f6dad 100644
--- a/src/cuda/backend.f90
+++ b/src/cuda/backend.f90
@@ -50,8 +50,8 @@ module m_cuda_backend
       procedure :: sum_zintox => sum_zintox_cuda
       procedure :: vecadd => vecadd_cuda
       procedure :: scalar_product => scalar_product_cuda
-      procedure :: set_field => set_field_cuda
-      procedure :: get_field => get_field_cuda
+      procedure :: copy_data_to_f => copy_data_to_f_cuda
+      procedure :: copy_f_to_data => copy_f_to_data_cuda
       procedure :: init_poisson_fft => init_cuda_poisson_fft
       procedure :: transeq_cuda_dist
       procedure :: transeq_cuda_thom
@@ -630,27 +630,21 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n)
 
    end subroutine copy_into_buffers
 
-   subroutine set_field_cuda(self, f, arr)
-      implicit none
-
-      class(cuda_backend_t) :: self
+   subroutine copy_data_to_f_cuda(self, f, data)
+      class(cuda_backend_t), intent(inout) :: self
       class(field_t), intent(inout) :: f
-      real(dp), dimension(:, :, :), intent(in) :: arr
-
-      select type(f); type is (cuda_field_t); f%data_d = arr; end select
+      real(dp), dimension(:, :, :), intent(inout) :: data
 
-   end subroutine set_field_cuda
+      select type(f); type is (cuda_field_t); f%data_d = data; end select
+   end subroutine copy_data_to_f_cuda
 
-   subroutine get_field_cuda(self, arr, f)
-      implicit none
-
-      class(cuda_backend_t) :: self
-      real(dp), dimension(:, :, :), intent(out) :: arr
+   subroutine copy_f_to_data_cuda(self, data, f)
+      class(cuda_backend_t), intent(inout) :: self
+      real(dp), dimension(:, :, :), intent(out) :: data
       class(field_t), intent(in) :: f
 
-      select type(f); type is (cuda_field_t); arr = f%data_d; end select
-
-   end subroutine get_field_cuda
+      select type(f); type is (cuda_field_t); data = f%data_d; end select
+   end subroutine copy_f_to_data_cuda
 
    subroutine init_cuda_poisson_fft(self, xdirps, ydirps, zdirps)
       implicit none
diff --git a/src/omp/backend.f90 b/src/omp/backend.f90
index 20fcfc53..c1c8c2e1 100644
--- a/src/omp/backend.f90
+++ b/src/omp/backend.f90
@@ -34,8 +34,8 @@ module m_omp_backend
       procedure :: sum_zintox => sum_zintox_omp
       procedure :: vecadd => vecadd_omp
       procedure :: scalar_product => scalar_product_omp
-      procedure :: set_field => set_field_omp
-      procedure :: get_field => get_field_omp
+      procedure :: copy_data_to_f => copy_data_to_f_omp
+      procedure :: copy_f_to_data => copy_f_to_data_omp
       procedure :: init_poisson_fft => init_omp_poisson_fft
       procedure :: transeq_omp_dist
    end type omp_backend_t
@@ -363,27 +363,21 @@ subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks)
 
    end subroutine copy_into_buffers
 
-   subroutine set_field_omp(self, f, arr)
-      implicit none
-
-      class(omp_backend_t) :: self
+   subroutine copy_data_to_f_omp(self, f, data)
+      class(omp_backend_t), intent(inout) :: self
       class(field_t), intent(inout) :: f
-      real(dp), dimension(:, :, :), intent(in) :: arr
-
-      f%data = arr
+      real(dp), dimension(:, :, :), intent(in) :: data
 
-   end subroutine set_field_omp
+      f%data = data
+   end subroutine copy_data_to_f_omp
 
-   subroutine get_field_omp(self, arr, f)
-      implicit none
-
-      class(omp_backend_t) :: self
-      real(dp), dimension(:, :, :), intent(out) :: arr
+   subroutine copy_f_to_data_omp(self, data, f)
+      class(omp_backend_t), intent(inout) :: self
+      real(dp), dimension(:, :, :), intent(out) :: data
       class(field_t), intent(in) :: f
 
-      arr = f%data
-
-   end subroutine get_field_omp
+      data = f%data
+   end subroutine copy_f_to_data_omp
 
    subroutine init_omp_poisson_fft(self, xdirps, ydirps, zdirps)
       implicit none
diff --git a/src/solver.f90 b/src/solver.f90
index 2e1fa384..fcf67049 100644
--- a/src/solver.f90
+++ b/src/solver.f90
@@ -133,9 +133,9 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) &
          end do
       end do
 
-      call solver%backend%set_field(solver%u, u_init)
-      call solver%backend%set_field(solver%v, v_init)
-      call solver%backend%set_field(solver%w, w_init)
+      call solver%backend%set_field_data(solver%u, u_init, 1)
+      call solver%backend%set_field_data(solver%v, v_init, 1)
+      call solver%backend%set_field_data(solver%w, w_init, 1)
 
       deallocate(u_init, v_init, w_init)
       print*, 'initial conditions are set'
@@ -595,7 +595,7 @@ subroutine output(self, t, u_out)
       div_u => self%backend%allocator%get_block(DIR_Z)
 
       call self%divergence_v2p(div_u, self%u, self%v, self%w)
-      call self%backend%get_field(u_out, div_u)
+      call self%backend%get_field_data(u_out, div_u, 3)
 
       call self%backend%allocator%release_block(div_u)
 
@@ -671,9 +671,9 @@ subroutine run(self, u_out, v_out, w_out)
 
       print*, 'run end'
 
-      call self%backend%get_field(u_out, self%u)
-      call self%backend%get_field(v_out, self%v)
-      call self%backend%get_field(w_out, self%w)
+      call self%backend%get_field_data(u_out, self%u, 1)
+      call self%backend%get_field_data(v_out, self%v, 1)
+      call self%backend%get_field_data(w_out, self%w, 1)
 
    end subroutine run
 

From af20ec25d20abba5c72804f94613c8fabd284eea Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Wed, 20 Mar 2024 18:33:54 +0000
Subject: [PATCH 09/11] Convert initial condition arrays into Cartesian
 ordering.

---
 src/solver.f90 | 36 +++++++++++++++---------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/src/solver.f90 b/src/solver.f90
index fcf67049..64dd5f45 100644
--- a/src/solver.f90
+++ b/src/solver.f90
@@ -88,7 +88,7 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) &
       integer :: dims(3)
 
       real(dp) :: x, y, z
-      integer :: nx, ny, nz, i, j, b, ka, kb, ix, iy, iz, sz
+      integer :: nx, ny, nz, i, j, k
 
       solver%backend => backend
       solver%time_integrator => time_integrator
@@ -102,7 +102,7 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) &
       solver%w => solver%backend%allocator%get_block(DIR_X)
 
       ! Set initial conditions
-      dims(:) = solver%backend%allocator%xdims_padded(:)
+      dims(:) = solver%backend%allocator%cdims_padded(:)
       allocate(u_init(dims(1), dims(2), dims(3)))
       allocate(v_init(dims(1), dims(2), dims(3)))
       allocate(w_init(dims(1), dims(2), dims(3)))
@@ -112,30 +112,24 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) &
       solver%n_iters = globs%n_iters
       solver%n_output = globs%n_output
 
-      sz = dims(1)
       nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc
-      do ka = 1, nz
-         do kb = 1, ny/sz
-            do j = 1, nx
-               do i = 1, sz
-                  ! Mapping to ix, iy, iz depends on global group numbering
-                  ix = j; iy = (kb - 1)*sz + i; iz = ka
-                  x = (ix - 1)*globs%dx
-                  y = (iy - 1)*globs%dy
-                  z = (iz - 1)*globs%dz
-
-                  b = ka + (kb - 1)*xdirps%n
-                  u_init(i, j, b) = sin(x)*cos(y)*cos(z)
-                  v_init(i, j, b) =-cos(x)*sin(y)*cos(z)
-                  w_init(i, j, b) = 0
-               end do
+      do k = 1, nz
+         do j = 1, ny
+            do i = 1, nx
+               x = (i - 1)*globs%dx
+               y = (j - 1)*globs%dy
+               z = (k - 1)*globs%dz
+
+               u_init(i, j, k) = sin(x)*cos(y)*cos(z)
+               v_init(i, j, k) = -cos(x)*sin(y)*cos(z)
+               w_init(i, j, k) = 0
             end do
          end do
       end do
 
-      call solver%backend%set_field_data(solver%u, u_init, 1)
-      call solver%backend%set_field_data(solver%v, v_init, 1)
-      call solver%backend%set_field_data(solver%w, w_init, 1)
+      call solver%backend%set_field_data(solver%u, u_init)
+      call solver%backend%set_field_data(solver%v, v_init)
+      call solver%backend%set_field_data(solver%w, w_init)
 
       deallocate(u_init, v_init, w_init)
       print*, 'initial conditions are set'

From 224cf5c6887f78ec8e54abf1138158383520008c Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Wed, 20 Mar 2024 18:57:42 +0000
Subject: [PATCH 10/11] Convert output arrays into Cartesian.

---
 src/solver.f90   | 8 ++++----
 src/xcompact.f90 | 8 +++++---
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/solver.f90 b/src/solver.f90
index 64dd5f45..977eb5d9 100644
--- a/src/solver.f90
+++ b/src/solver.f90
@@ -589,7 +589,7 @@ subroutine output(self, t, u_out)
       div_u => self%backend%allocator%get_block(DIR_Z)
 
       call self%divergence_v2p(div_u, self%u, self%v, self%w)
-      call self%backend%get_field_data(u_out, div_u, 3)
+      call self%backend%get_field_data(u_out, div_u)
 
       call self%backend%allocator%release_block(div_u)
 
@@ -665,9 +665,9 @@ subroutine run(self, u_out, v_out, w_out)
 
       print*, 'run end'
 
-      call self%backend%get_field_data(u_out, self%u, 1)
-      call self%backend%get_field_data(v_out, self%v, 1)
-      call self%backend%get_field_data(w_out, self%w, 1)
+      call self%backend%get_field_data(u_out, self%u)
+      call self%backend%get_field_data(v_out, self%v)
+      call self%backend%get_field_data(w_out, self%w)
 
    end subroutine run
 
diff --git a/src/xcompact.f90 b/src/xcompact.f90
index 06bf73c6..7a976a97 100644
--- a/src/xcompact.f90
+++ b/src/xcompact.f90
@@ -41,6 +41,7 @@ program xcompact
    real(dp), allocatable, dimension(:, :, :) :: u, v, w
 
    real(dp) :: t_start, t_end
+   integer :: dims(3)
    integer :: nrank, nproc, ierr
 
    call MPI_Init(ierr)
@@ -132,9 +133,10 @@ program xcompact
    print*, 'OpenMP backend instantiated'
 #endif
 
-   allocate(u(SZ, globs%nx_loc, globs%n_groups_x))
-   allocate(v(SZ, globs%nx_loc, globs%n_groups_x))
-   allocate(w(SZ, globs%nx_loc, globs%n_groups_x))
+   dims(:) = allocator%cdims_padded
+   allocate (u(dims(1), dims(2), dims(3)))
+   allocate (v(dims(1), dims(2), dims(3)))
+   allocate (w(dims(1), dims(2), dims(3)))
 
    time_integrator = time_intg_t(allocator=allocator, backend=backend)
    print*, 'time integrator instantiated'

From 955689919f6c6ea793e6917a7c39771c9b1229e4 Mon Sep 17 00:00:00 2001
From: Semih Akkurt <s.akkurt18@imperial.ac.uk>
Date: Thu, 21 Mar 2024 12:27:20 +0000
Subject: [PATCH 11/11] Apply suggestions from code review

Co-authored-by: Jamie J Quinn <JamieJQuinn@users.noreply.github.com>
---
 src/backend.f90 | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/backend.f90 b/src/backend.f90
index 73bfd01b..d6ce7839 100644
--- a/src/backend.f90
+++ b/src/backend.f90
@@ -206,12 +206,14 @@ end subroutine init_poisson_fft
 contains
 
    subroutine get_field_data(self, data, f, dir)
+   !! Extract data from field `f` optionally reordering into `dir` orientation.
+   !! To output in same orientation as `f`, use `call ...%get_field_data(data, f, f%dir)`
       implicit none
 
       class(base_backend_t) :: self
-      real(dp), dimension(:, :, :), intent(out) :: data
-      class(field_t), intent(in) :: f
-      integer, optional, intent(in) :: dir
+      real(dp), dimension(:, :, :), intent(out) :: data !! Output array
+      class(field_t), intent(in) :: f !! Field
+      integer, optional, intent(in) :: dir !! Desired orientation of output array (defaults to Cartesian)
 
       class(field_t), pointer :: f_temp
       integer :: direction, rdr_dir
@@ -241,9 +243,9 @@ subroutine set_field_data(self, f, data, dir)
       implicit none
 
       class(base_backend_t) :: self
-      class(field_t), intent(inout) :: f
-      real(dp), dimension(:, :, :), intent(in) :: data
-      integer, optional, intent(in) :: dir
+      class(field_t), intent(inout) :: f !! Field
+      real(dp), dimension(:, :, :), intent(in) :: data !! Input array
+      integer, optional, intent(in) :: dir !! Orientation of input array (defaults to Cartesian)
 
       class(field_t), pointer :: f_temp
       integer :: direction, rdr_dir