From f469f7171413947d72786eadd5cf79e6803f595b Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Thu, 14 Mar 2024 14:52:27 +0000 Subject: [PATCH 01/11] Add c2x and x2c reorder kernels in CUDA backend. --- src/cuda/kernels/reorder.f90 | 46 ++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/src/cuda/kernels/reorder.f90 b/src/cuda/kernels/reorder.f90 index 11c74cac..52ccd0e5 100644 --- a/src/cuda/kernels/reorder.f90 +++ b/src/cuda/kernels/reorder.f90 @@ -6,6 +6,52 @@ module m_cuda_kernels_reorder contains + attributes(global) subroutine reorder_c2x(u_x, u_c, nz) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: u_x + real(dp), device, intent(in), dimension(:, :, :) :: u_c + integer, value, intent(in) :: nz + + real(dp), shared :: tile(SZ, SZ) + integer :: i, j, b_i, b_j, b_k + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + + ! copy into shared + tile(i, j) = u_c(i + (b_i - 1)*SZ, j + (b_j - 1)*SZ, b_k) + + call syncthreads() + + ! copy into output array from shared + u_x(i, j + (b_i - 1)*SZ, b_k + (b_j - 1)*nz) = tile(j, i) + + end subroutine reorder_c2x + + attributes(global) subroutine reorder_x2c(u_c, u_x, nz) + implicit none + + real(dp), device, intent(out), dimension(:, :, :) :: u_c + real(dp), device, intent(in), dimension(:, :, :) :: u_x + integer, value, intent(in) :: nz + + real(dp), shared :: tile(SZ, SZ) + integer :: i, j, b_i, b_j, b_k + + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + + ! copy into shared + tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_k + (b_j - 1)*nz) + + call syncthreads() + + ! copy into output array from shared + u_c(i + (b_i - 1)*SZ, j + (b_j - 1)*SZ, b_k) = tile(j, i) + + end subroutine reorder_x2c + attributes(global) subroutine reorder_x2y(u_y, u_x, nz) implicit none From 2dfd0d99ce7e51161a60e8b8bec3e5352cf17212 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Thu, 14 Mar 2024 14:53:19 +0000 Subject: [PATCH 02/11] Add a test for c2x and x2c reorder kernels. --- tests/cuda/test_cuda_reorder.f90 | 35 +++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/tests/cuda/test_cuda_reorder.f90 b/tests/cuda/test_cuda_reorder.f90 index bacd3450..2c8f6800 100644 --- a/tests/cuda/test_cuda_reorder.f90 +++ b/tests/cuda/test_cuda_reorder.f90 @@ -6,13 +6,15 @@ program test_cuda_reorder use m_common, only: dp use m_cuda_common, only: SZ use m_cuda_kernels_reorder, only: reorder_x2y, reorder_x2z, reorder_y2x, & - reorder_y2z, reorder_z2x, reorder_z2y + reorder_y2z, reorder_z2x, reorder_z2y, & + reorder_c2x, reorder_x2c implicit none logical :: allpass = .true. - real(dp), allocatable, dimension(:, :, :) :: u_i, u_o, u_temp - real(dp), device, allocatable, dimension(:, :, :) :: u_i_d, u_o_d, u_temp_d + real(dp), allocatable, dimension(:, :, :) :: u_i, u_o, u_temp, u_c + real(dp), device, allocatable, dimension(:, :, :) :: u_i_d, u_o_d, & + u_temp_d, u_c_d integer :: n_block, i, n_iters integer :: nx, ny, nz, ndof @@ -46,6 +48,10 @@ program test_cuda_reorder allocate (u_i_d(SZ, nx, n_block), u_o_d(SZ, nx, n_block)) allocate (u_temp_d(SZ, nx, n_block)) + ! Cartesian order storage + allocate (u_c_d(nx, ny, nz)) + allocate (u_c(nx, ny, nz)) + ! set a random field call random_number(u_i) @@ -126,6 +132,29 @@ program test_cuda_reorder end if end if + ! x ordering into Cartesian ordering + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call reorder_x2c<<>>(u_c_d, u_i_d, nz) + + ! sanitise u_o_d + u_o_d = 0 + + ! Cartesian ordering back to x ordering + call reorder_c2x<<>>(u_o_d, u_c_d, nz) + u_o = u_o_d + + ! now both u_o and u_i in x ordering, compare them + norm_u = norm2(u_o - u_i) + if (nrank == 0) then + if ( norm_u > tol ) then + allpass = .false. + write(stderr, '(a)') 'Check reorder x2c and c2x... failed' + else + write(stderr, '(a)') 'Check reorder x2c and c2x... passed' + end if + end if + ! Now the performance checks call cpu_time(tstart) do i = 1, n_iters From 0fa4f78fd97d2105136621e0649349fd3f731f58 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Thu, 14 Mar 2024 14:57:01 +0000 Subject: [PATCH 03/11] Add performance checks for the c2x/x2c reorder kernels in the tests. --- tests/cuda/test_cuda_reorder.f90 | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tests/cuda/test_cuda_reorder.f90 b/tests/cuda/test_cuda_reorder.f90 index 2c8f6800..595d57fb 100644 --- a/tests/cuda/test_cuda_reorder.f90 +++ b/tests/cuda/test_cuda_reorder.f90 @@ -216,6 +216,24 @@ program test_cuda_reorder call checkperf(tend - tstart, n_iters, ndof, 2._dp) + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call cpu_time(tstart) + do i = 1, n_iters + call reorder_x2c<<>>(u_c_d, u_i_d, nz) + end do + call cpu_time(tend) + + call checkperf(tend - tstart, n_iters, ndof, 2._dp) + + call cpu_time(tstart) + do i = 1, n_iters + call reorder_c2x<<>>(u_o_d, u_c_d, nz) + end do + call cpu_time(tend) + + call checkperf(tend - tstart, n_iters, ndof, 2._dp) + if (allpass) then if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' else From 39c79dd521499b01a1bce2adcc7ae9f96709c853 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Thu, 14 Mar 2024 15:22:06 +0000 Subject: [PATCH 04/11] Remove MPI stuff from reorder tests. --- tests/cuda/test_cuda_reorder.f90 | 97 ++++++++++---------------------- 1 file changed, 31 insertions(+), 66 deletions(-) diff --git a/tests/cuda/test_cuda_reorder.f90 b/tests/cuda/test_cuda_reorder.f90 index 595d57fb..f948c35f 100644 --- a/tests/cuda/test_cuda_reorder.f90 +++ b/tests/cuda/test_cuda_reorder.f90 @@ -1,7 +1,6 @@ program test_cuda_reorder use iso_fortran_env, only: stderr => error_unit use cudafor - use mpi use m_common, only: dp use m_cuda_common, only: SZ @@ -18,25 +17,10 @@ program test_cuda_reorder integer :: n_block, i, n_iters integer :: nx, ny, nz, ndof - integer :: nrank, nproc, pprev, pnext - integer :: ierr, ndevs, devnum type(dim3) :: blocks, threads real(dp) :: norm_u, tol = 1d-8, tstart, tend - call MPI_Init(ierr) - call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) - - if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' - - ierr = cudaGetDeviceCount(ndevs) - ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin - ierr = cudaGetDevice(devnum) - - !print*, 'I am rank', nrank, 'I am running on device', devnum - pnext = modulo(nrank - nproc + 1, nproc) - pprev = modulo(nrank - 1, nproc) nx = 512; ny = 512; nz = 512 n_block = ny*nz/SZ @@ -72,13 +56,11 @@ program test_cuda_reorder ! and check whether it matches the initial random field norm_u = norm2(u_o - u_i) - if (nrank == 0) then - if ( norm_u > tol ) then - allpass = .false. - write(stderr, '(a)') 'Check reorder x2y and y2x... failed' - else - write(stderr, '(a)') 'Check reorder x2y and y2x... passed' - end if + if ( norm_u > tol ) then + allpass = .false. + write(stderr, '(a)') 'Check reorder x2y and y2x... failed' + else + write(stderr, '(a)') 'Check reorder x2y and y2x... passed' end if ! we reuse u_o_d so zeroize in any case @@ -101,13 +83,11 @@ program test_cuda_reorder ! compare two y oriented fields norm_u = norm2(u_o - u_temp) - if (nrank == 0) then - if ( norm_u > tol ) then - allpass = .false. - write(stderr, '(a)') 'Check reorder y2z and y2z... failed' - else - write(stderr, '(a)') 'Check reorder y2z and y2z... passed' - end if + if ( norm_u > tol ) then + allpass = .false. + write(stderr, '(a)') 'Check reorder y2z and y2z... failed' + else + write(stderr, '(a)') 'Check reorder y2z and y2z... passed' end if ! reorder initial random field into z orientation @@ -123,13 +103,11 @@ program test_cuda_reorder ! compare two z oriented fields norm_u = norm2(u_o - u_i) - if (nrank == 0) then - if ( norm_u > tol ) then - allpass = .false. - write(stderr, '(a)') 'Check reorder x2z and z2x... failed' - else - write(stderr, '(a)') 'Check reorder x2z and z2x... passed' - end if + if ( norm_u > tol ) then + allpass = .false. + write(stderr, '(a)') 'Check reorder x2z and z2x... failed' + else + write(stderr, '(a)') 'Check reorder x2z and z2x... passed' end if ! x ordering into Cartesian ordering @@ -146,13 +124,17 @@ program test_cuda_reorder ! now both u_o and u_i in x ordering, compare them norm_u = norm2(u_o - u_i) - if (nrank == 0) then - if ( norm_u > tol ) then - allpass = .false. - write(stderr, '(a)') 'Check reorder x2c and c2x... failed' - else - write(stderr, '(a)') 'Check reorder x2c and c2x... passed' - end if + if ( norm_u > tol ) then + allpass = .false. + write(stderr, '(a)') 'Check reorder x2c and c2x... failed' + else + write(stderr, '(a)') 'Check reorder x2c and c2x... passed' + end if + + if (allpass) then + write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' + else + error stop 'SOME TESTS FAILED.' end if ! Now the performance checks @@ -234,14 +216,6 @@ program test_cuda_reorder call checkperf(tend - tstart, n_iters, ndof, 2._dp) - if (allpass) then - if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' - else - error stop 'SOME TESTS FAILED.' - end if - - call MPI_Finalize(ierr) - contains subroutine checkperf(t_tot, n_iters, ndof, consumed_bw) @@ -250,31 +224,22 @@ subroutine checkperf(t_tot, n_iters, ndof, consumed_bw) real(dp), intent(in) :: t_tot, consumed_bw integer, intent(in) :: n_iters, ndof - real(dp) :: achievedBW, devBW, achievedBWmax, achievedBWmin + real(dp) :: achievedBW, devBW integer :: ierr, memClockRt, memBusWidth ! BW utilisation and performance checks achievedBW = consumed_bw*n_iters*ndof*dp/t_tot - call MPI_Allreduce(achievedBW, achievedBWmax, 1, MPI_DOUBLE_PRECISION, & - MPI_MAX, MPI_COMM_WORLD, ierr) - call MPI_Allreduce(achievedBW, achievedBWmin, 1, MPI_DOUBLE_PRECISION, & - MPI_MIN, MPI_COMM_WORLD, ierr) - if (nrank == 0) then - print'(a, f8.3, a)', 'Achieved BW min: ', achievedBWmin/2**30, ' GiB/s' - print'(a, f8.3, a)', 'Achieved BW max: ', achievedBWmax/2**30, ' GiB/s' - end if + print'(a, f8.3, a)', 'Achieved BW: ', achievedBW/2**30, ' GiB/s' ierr = cudaDeviceGetAttribute(memClockRt, cudaDevAttrMemoryClockRate, 0) ierr = cudaDeviceGetAttribute(memBusWidth, & cudaDevAttrGlobalMemoryBusWidth, 0) devBW = 2*memBusWidth/8._dp*memClockRt*1000 - if (nrank == 0) then - print'(a, f8.3, a)', 'Device BW: ', devBW/2**30, ' GiB/s' - print'(a, f5.2)', 'Effective BW util min: %', achievedBWmin/devBW*100 - print'(a, f5.2)', 'Effective BW util max: %', achievedBWmax/devBW*100 - end if + print'(a, f8.3, a)', 'Device BW: ', devBW/2**30, ' GiB/s' + print'(a, f5.2)', 'Effective BW util: %', achievedBW/devBW*100 + end subroutine checkperf end program test_cuda_reorder From 1353cd003576db64a38f1f29ebf5f55d40fe19bb Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Thu, 14 Mar 2024 15:30:33 +0000 Subject: [PATCH 05/11] Cleanup performance tests. --- tests/cuda/test_cuda_reorder.f90 | 35 +++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/tests/cuda/test_cuda_reorder.f90 b/tests/cuda/test_cuda_reorder.f90 index f948c35f..d3bbf97c 100644 --- a/tests/cuda/test_cuda_reorder.f90 +++ b/tests/cuda/test_cuda_reorder.f90 @@ -138,66 +138,74 @@ program test_cuda_reorder end if ! Now the performance checks + + print*, 'Performance test: reorder_x2y' + blocks = dim3(nx/SZ, nz, ny/SZ) + threads = dim3(SZ, SZ, 1) call cpu_time(tstart) do i = 1, n_iters - blocks = dim3(nx/SZ, nz, ny/SZ) - threads = dim3(SZ, SZ, 1) call reorder_x2y<<>>(u_o_d, u_i_d, nz) end do call cpu_time(tend) call checkperf(tend - tstart, n_iters, ndof, 2._dp) + print*, 'Performance test: reorder_x2z' + blocks = dim3(nx, ny/SZ, 1) + threads = dim3(SZ, 1, 1) call cpu_time(tstart) do i = 1, n_iters - blocks = dim3(nx, ny/SZ, 1) - threads = dim3(SZ, 1, 1) call reorder_x2z<<>>(u_o_d, u_i_d, nz) end do call cpu_time(tend) call checkperf(tend - tstart, n_iters, ndof, 2._dp) + print*, 'Performance test: reorder_y2x' + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) call cpu_time(tstart) do i = 1, n_iters - blocks = dim3(nx/SZ, ny/SZ, nz) - threads = dim3(SZ, SZ, 1) call reorder_y2x<<>>(u_o_d, u_i_d, nz) end do call cpu_time(tend) call checkperf(tend - tstart, n_iters, ndof, 2._dp) + print*, 'Performance test: reorder_y2z' + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) call cpu_time(tstart) do i = 1, n_iters - blocks = dim3(nx/SZ, ny/SZ, nz) - threads = dim3(SZ, SZ, 1) call reorder_y2z<<>>(u_o_d, u_i_d, nx, nz) end do call cpu_time(tend) call checkperf(tend - tstart, n_iters, ndof, 2._dp) + print*, 'Performance test: reorder_z2x' + blocks = dim3(nx, ny/SZ, 1) + threads = dim3(SZ, 1, 1) call cpu_time(tstart) do i = 1, n_iters - blocks = dim3(nx, ny/SZ, 1) - threads = dim3(SZ, 1, 1) call reorder_z2x<<>>(u_o_d, u_i_d, nz) end do call cpu_time(tend) call checkperf(tend - tstart, n_iters, ndof, 2._dp) + print*, 'Performance test: reorder_z2y' + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) call cpu_time(tstart) do i = 1, n_iters - blocks = dim3(nx/SZ, ny/SZ, nz) - threads = dim3(SZ, SZ, 1) call reorder_z2y<<>>(u_o_d, u_i_d, nx, nz) end do call cpu_time(tend) call checkperf(tend - tstart, n_iters, ndof, 2._dp) + print*, 'Performance test: reorder_x2c' blocks = dim3(nx/SZ, ny/SZ, nz) threads = dim3(SZ, SZ, 1) call cpu_time(tstart) @@ -208,6 +216,9 @@ program test_cuda_reorder call checkperf(tend - tstart, n_iters, ndof, 2._dp) + print*, 'Performance test: reorder_c2x' + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) call cpu_time(tstart) do i = 1, n_iters call reorder_c2x<<>>(u_o_d, u_c_d, nz) From 1cc36e5c544026c1f77f680ba93bea05c8301446 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Thu, 14 Mar 2024 16:04:39 +0000 Subject: [PATCH 06/11] Make c2x/x2c reorder kernels available in CUDA backend. --- src/common.f90 | 3 ++- src/cuda/backend.f90 | 11 ++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/common.f90 b/src/common.f90 index 7830ac09..6232bcdd 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -5,7 +5,8 @@ module m_common real(dp), parameter :: pi = 4*atan(1.0_dp) integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, & - RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32 + RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32, & + RDR_C2X = 41, RDR_X2C = 14 integer, parameter :: DIR_X = 1, DIR_Y = 2, DIR_Z = 3, DIR_C = 4 integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1 diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 9112c62f..6d10123d 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -6,7 +6,8 @@ module m_cuda_backend use m_allocator, only: allocator_t, field_t use m_base_backend, only: base_backend_t use m_common, only: dp, globs_t, & - RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y + RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, & + RDR_C2X, RDR_X2C use m_poisson_fft, only: poisson_fft_t use m_tdsops, only: dirps_t, tdsops_t @@ -450,6 +451,14 @@ subroutine reorder_cuda(self, u_o, u_i, direction) threads = dim3(SZ, SZ, 1) call reorder_z2y<<>>(u_o_d, u_i_d, & self%nx_loc, self%nz_loc) + case (RDR_C2X) ! c2x + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call reorder_c2x<<>>(u_o_d, u_i_d, self%nz_loc) + case (RDR_X2C) ! x2c + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call reorder_x2c<<>>(u_o_d, u_i_d, self%nz_loc) case default error stop 'Reorder direction is undefined.' end select From 3b376e27565abec8e306c240a17b8bc742543a54 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Wed, 20 Mar 2024 13:30:44 +0000 Subject: [PATCH 07/11] Add support for reorders for Y2C, Z2C, C2Y, C2Z in CUDA backend. --- src/common.f90 | 3 +- src/cuda/backend.f90 | 79 ++++++++++++++++++++++++++++++++++++++------ 2 files changed, 70 insertions(+), 12 deletions(-) diff --git a/src/common.f90 b/src/common.f90 index 6232bcdd..181781d7 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -6,7 +6,8 @@ module m_common integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, & RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32, & - RDR_C2X = 41, RDR_X2C = 14 + RDR_C2X = 41, RDR_C2Y = 42, RDR_C2Z = 43, & + RDR_X2C = 14, RDR_Y2C = 24, RDR_Z2C = 34 integer, parameter :: DIR_X = 1, DIR_Y = 2, DIR_Z = 3, DIR_C = 4 integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1 diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 6d10123d..5acadf6f 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -7,7 +7,8 @@ module m_cuda_backend use m_base_backend, only: base_backend_t use m_common, only: dp, globs_t, & RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, & - RDR_C2X, RDR_X2C + RDR_C2X, RDR_C2Y, RDR_C2Z, RDR_X2C, RDR_Y2C, RDR_Z2C, & + DIR_X, DIR_Y, DIR_Z, DIR_C use m_poisson_fft, only: poisson_fft_t use m_tdsops, only: dirps_t, tdsops_t @@ -20,7 +21,8 @@ module m_cuda_backend use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs use m_cuda_kernels_reorder, only: & reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2x, & - reorder_z2y, sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy + reorder_z2y, reorder_c2x, reorder_x2c, & + sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy implicit none @@ -418,47 +420,102 @@ subroutine reorder_cuda(self, u_o, u_i, direction) class(field_t), intent(in) :: u_i integer, intent(in) :: direction - real(dp), device, pointer, dimension(:, :, :) :: u_o_d, u_i_d + real(dp), device, pointer, dimension(:, :, :) :: u_o_d, u_i_d, u_temp_d + class(field_t), pointer :: u_temp type(dim3) :: blocks, threads call resolve_field_t(u_o_d, u_o) call resolve_field_t(u_i_d, u_i) select case (direction) - case (RDR_X2Y) ! x2y + case (RDR_X2Y) blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ) threads = dim3(SZ, SZ, 1) call reorder_x2y<<>>(u_o_d, u_i_d, self%nz_loc) - case (RDR_X2Z) ! x2z + case (RDR_X2Z) blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) threads = dim3(SZ, 1, 1) call reorder_x2z<<>>(u_o_d, u_i_d, self%nz_loc) - case (RDR_Y2X) ! y2x + case (RDR_Y2X) blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) threads = dim3(SZ, SZ, 1) call reorder_y2x<<>>(u_o_d, u_i_d, self%nz_loc) - case (RDR_Y2Z) ! y2z + case (RDR_Y2Z) blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) threads = dim3(SZ, SZ, 1) call reorder_y2z<<>>(u_o_d, u_i_d, & self%nx_loc, self%nz_loc) - case (RDR_Z2X) ! z2x + case (RDR_Z2X) blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) threads = dim3(SZ, 1, 1) call reorder_z2x<<>>(u_o_d, u_i_d, self%nz_loc) - case (RDR_Z2Y) ! z2y + case (RDR_Z2Y) blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) threads = dim3(SZ, SZ, 1) call reorder_z2y<<>>(u_o_d, u_i_d, & self%nx_loc, self%nz_loc) - case (RDR_C2X) ! c2x + case (RDR_C2X) blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) threads = dim3(SZ, SZ, 1) call reorder_c2x<<>>(u_o_d, u_i_d, self%nz_loc) - case (RDR_X2C) ! x2c + case (RDR_C2Y) + ! First reorder from C to X, then from X to Y + u_temp => self%allocator%get_block(DIR_X) + call resolve_field_t(u_temp_d, u_temp) + + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call reorder_c2x<<>>(u_temp_d, u_i_d, self%nz_loc) + + blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ) + threads = dim3(SZ, SZ, 1) + call reorder_x2y<<>>(u_o_d, u_temp_d, self%nz_loc) + + call self%allocator%release_block(u_temp) + case (RDR_C2Z) + ! First reorder from C to X, then from X to Z + u_temp => self%allocator%get_block(DIR_X) + call resolve_field_t(u_temp_d, u_temp) + + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call reorder_c2x<<>>(u_temp_d, u_i_d, self%nz_loc) + + blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_x2z<<>>(u_o_d, u_temp_d, self%nz_loc) + + call self%allocator%release_block(u_temp) + case (RDR_X2C) blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) threads = dim3(SZ, SZ, 1) call reorder_x2c<<>>(u_o_d, u_i_d, self%nz_loc) + case (RDR_Y2C) + ! First reorder from Y to X, then from X to C + u_temp => self%allocator%get_block(DIR_X) + call resolve_field_t(u_temp_d, u_temp) + + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call reorder_y2x<<>>(u_temp_d, u_i_d, self%nz_loc) + + call reorder_x2c<<>>(u_o_d, u_temp_d, self%nz_loc) + + call self%allocator%release_block(u_temp) + case (RDR_Z2C) + ! First reorder from Z to X, then from X to C + u_temp => self%allocator%get_block(DIR_X) + call resolve_field_t(u_temp_d, u_temp) + + blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_z2x<<>>(u_temp_d, u_i_d, self%nz_loc) + + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call reorder_x2c<<>>(u_o_d, u_temp_d, self%nz_loc) + + call self%allocator%release_block(u_temp) case default error stop 'Reorder direction is undefined.' end select From 8128e80ffea281831907d18e88ebf4c42caa977e Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Wed, 20 Mar 2024 18:11:23 +0000 Subject: [PATCH 08/11] Improve get field and set field operations. --- src/backend.f90 | 103 +++++++++++++++++++++++++++++++++++-------- src/common.f90 | 13 ++++++ src/cuda/backend.f90 | 30 +++++-------- src/omp/backend.f90 | 30 +++++-------- src/solver.f90 | 14 +++--- 5 files changed, 129 insertions(+), 61 deletions(-) diff --git a/src/backend.f90 b/src/backend.f90 index e17970e5..73bfd01b 100644 --- a/src/backend.f90 +++ b/src/backend.f90 @@ -1,6 +1,6 @@ module m_base_backend use m_allocator, only: allocator_t, field_t - use m_common, only: dp + use m_common, only: dp, DIR_C, get_rdr_from_dirs use m_poisson_fft, only: poisson_fft_t use m_tdsops, only: tdsops_t, dirps_t @@ -36,10 +36,12 @@ module m_base_backend procedure(sum_intox), deferred :: sum_zintox procedure(vecadd), deferred :: vecadd procedure(scalar_product), deferred :: scalar_product - procedure(get_field), deferred :: get_field - procedure(set_field), deferred :: set_field + procedure(copy_data_to_f), deferred :: copy_data_to_f + procedure(copy_f_to_data), deferred :: copy_f_to_data procedure(alloc_tdsops), deferred :: alloc_tdsops procedure(init_poisson_fft), deferred :: init_poisson_fft + procedure :: get_field_data + procedure :: set_field_data end type base_backend_t abstract interface @@ -143,32 +145,31 @@ end function scalar_product end interface abstract interface - subroutine get_field(self, arr, f) - !! copy the specialist data structure from device or host back - !! to a regular 3D data structure. + subroutine copy_data_to_f(self, f, data) + !! Copy the specialist data structure from device or host back + !! to a regular 3D data array in host memory. import :: base_backend_t import :: dp import :: field_t implicit none - class(base_backend_t) :: self - real(dp), dimension(:, :, :), intent(out) :: arr - class(field_t), intent(in) :: f - end subroutine get_field + class(base_backend_t), intent(inout) :: self + class(field_t), intent(inout) :: f + real(dp), dimension(:, :, :), intent(in) :: data + end subroutine copy_data_to_f - subroutine set_field(self, f, arr) - !! copy the initial condition stored in a regular 3D data - !! structure into the specialist data structure array on the - !! device or host. + subroutine copy_f_to_data(self, data, f) + !! Copy a regular 3D array in host memory into the specialist + !! data structure field that lives on device or host import :: base_backend_t import :: dp import :: field_t implicit none - class(base_backend_t) :: self - class(field_t), intent(inout) :: f - real(dp), dimension(:, :, :), intent(in) :: arr - end subroutine set_field + class(base_backend_t), intent(inout) :: self + real(dp), dimension(:, :, :), intent(out) :: data + class(field_t), intent(in) :: f + end subroutine copy_f_to_data end interface abstract interface @@ -202,4 +203,70 @@ subroutine init_poisson_fft(self, xdirps, ydirps, zdirps) end subroutine init_poisson_fft end interface +contains + + subroutine get_field_data(self, data, f, dir) + implicit none + + class(base_backend_t) :: self + real(dp), dimension(:, :, :), intent(out) :: data + class(field_t), intent(in) :: f + integer, optional, intent(in) :: dir + + class(field_t), pointer :: f_temp + integer :: direction, rdr_dir + + if (present(dir)) then + direction = dir + else + direction = DIR_C + end if + + ! Returns 0 if no reorder required + rdr_dir = get_rdr_from_dirs(direction, f%dir) + + ! Carry out a reorder if we need, and copy from field to data array + if (rdr_dir /= 0) then + f_temp => self%allocator%get_block(direction) + call self%reorder(f_temp, f, rdr_dir) + call self%copy_f_to_data(data, f_temp) + call self%allocator%release_block(f_temp) + else + call self%copy_f_to_data(data, f) + end if + + end subroutine get_field_data + + subroutine set_field_data(self, f, data, dir) + implicit none + + class(base_backend_t) :: self + class(field_t), intent(inout) :: f + real(dp), dimension(:, :, :), intent(in) :: data + integer, optional, intent(in) :: dir + + class(field_t), pointer :: f_temp + integer :: direction, rdr_dir + + if (present(dir)) then + direction = dir + else + direction = DIR_C + end if + + ! Returns 0 if no reorder required + rdr_dir = get_rdr_from_dirs(f%dir, direction) + + ! Carry out a reorder if we need, and copy from data array to field + if (rdr_dir /= 0) then + f_temp => self%allocator%get_block(direction) + call self%copy_data_to_f(f_temp, data) + call self%reorder(f, f_temp, rdr_dir) + call self%allocator%release_block(f_temp) + else + call self%copy_data_to_f(f, data) + end if + + end subroutine set_field_data + end module m_base_backend diff --git a/src/common.f90 b/src/common.f90 index 181781d7..a2ceab27 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -11,6 +11,12 @@ module m_common integer, parameter :: DIR_X = 1, DIR_Y = 2, DIR_Z = 3, DIR_C = 4 integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1 + integer, protected :: & + rdr_map(4, 4) = reshape([0, RDR_X2Y, RDR_X2Z, RDR_X2C, & + RDR_Y2X, 0, RDR_Y2Z, RDR_Y2C, & + RDR_Z2X, RDR_Z2Y, 0, RDR_Z2C, & + RDR_C2X, RDR_C2Y, RDR_C2Z, 0], shape=[4, 4]) + type :: globs_t integer :: nx, ny, nz integer :: nx_loc, ny_loc, nz_loc @@ -81,4 +87,11 @@ subroutine set_pprev_pnext(xprev, xnext, yprev, ynext, zprev, znext, & end subroutine set_pprev_pnext + integer function get_rdr_from_dirs(dir_from, dir_to) result(rdr_dir) + !! Returns RDR_?2? value based on two direction inputs + integer, intent(in) :: dir_from, dir_to + + rdr_dir = rdr_map(dir_from, dir_to) + end function get_rdr_from_dirs + end module m_common diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index 5acadf6f..ff7f6dad 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -50,8 +50,8 @@ module m_cuda_backend procedure :: sum_zintox => sum_zintox_cuda procedure :: vecadd => vecadd_cuda procedure :: scalar_product => scalar_product_cuda - procedure :: set_field => set_field_cuda - procedure :: get_field => get_field_cuda + procedure :: copy_data_to_f => copy_data_to_f_cuda + procedure :: copy_f_to_data => copy_f_to_data_cuda procedure :: init_poisson_fft => init_cuda_poisson_fft procedure :: transeq_cuda_dist procedure :: transeq_cuda_thom @@ -630,27 +630,21 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) end subroutine copy_into_buffers - subroutine set_field_cuda(self, f, arr) - implicit none - - class(cuda_backend_t) :: self + subroutine copy_data_to_f_cuda(self, f, data) + class(cuda_backend_t), intent(inout) :: self class(field_t), intent(inout) :: f - real(dp), dimension(:, :, :), intent(in) :: arr - - select type(f); type is (cuda_field_t); f%data_d = arr; end select + real(dp), dimension(:, :, :), intent(inout) :: data - end subroutine set_field_cuda + select type(f); type is (cuda_field_t); f%data_d = data; end select + end subroutine copy_data_to_f_cuda - subroutine get_field_cuda(self, arr, f) - implicit none - - class(cuda_backend_t) :: self - real(dp), dimension(:, :, :), intent(out) :: arr + subroutine copy_f_to_data_cuda(self, data, f) + class(cuda_backend_t), intent(inout) :: self + real(dp), dimension(:, :, :), intent(out) :: data class(field_t), intent(in) :: f - select type(f); type is (cuda_field_t); arr = f%data_d; end select - - end subroutine get_field_cuda + select type(f); type is (cuda_field_t); data = f%data_d; end select + end subroutine copy_f_to_data_cuda subroutine init_cuda_poisson_fft(self, xdirps, ydirps, zdirps) implicit none diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 20fcfc53..c1c8c2e1 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -34,8 +34,8 @@ module m_omp_backend procedure :: sum_zintox => sum_zintox_omp procedure :: vecadd => vecadd_omp procedure :: scalar_product => scalar_product_omp - procedure :: set_field => set_field_omp - procedure :: get_field => get_field_omp + procedure :: copy_data_to_f => copy_data_to_f_omp + procedure :: copy_f_to_data => copy_f_to_data_omp procedure :: init_poisson_fft => init_omp_poisson_fft procedure :: transeq_omp_dist end type omp_backend_t @@ -363,27 +363,21 @@ subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks) end subroutine copy_into_buffers - subroutine set_field_omp(self, f, arr) - implicit none - - class(omp_backend_t) :: self + subroutine copy_data_to_f_omp(self, f, data) + class(omp_backend_t), intent(inout) :: self class(field_t), intent(inout) :: f - real(dp), dimension(:, :, :), intent(in) :: arr - - f%data = arr + real(dp), dimension(:, :, :), intent(in) :: data - end subroutine set_field_omp + f%data = data + end subroutine copy_data_to_f_omp - subroutine get_field_omp(self, arr, f) - implicit none - - class(omp_backend_t) :: self - real(dp), dimension(:, :, :), intent(out) :: arr + subroutine copy_f_to_data_omp(self, data, f) + class(omp_backend_t), intent(inout) :: self + real(dp), dimension(:, :, :), intent(out) :: data class(field_t), intent(in) :: f - arr = f%data - - end subroutine get_field_omp + data = f%data + end subroutine copy_f_to_data_omp subroutine init_omp_poisson_fft(self, xdirps, ydirps, zdirps) implicit none diff --git a/src/solver.f90 b/src/solver.f90 index 2e1fa384..fcf67049 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -133,9 +133,9 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & end do end do - call solver%backend%set_field(solver%u, u_init) - call solver%backend%set_field(solver%v, v_init) - call solver%backend%set_field(solver%w, w_init) + call solver%backend%set_field_data(solver%u, u_init, 1) + call solver%backend%set_field_data(solver%v, v_init, 1) + call solver%backend%set_field_data(solver%w, w_init, 1) deallocate(u_init, v_init, w_init) print*, 'initial conditions are set' @@ -595,7 +595,7 @@ subroutine output(self, t, u_out) div_u => self%backend%allocator%get_block(DIR_Z) call self%divergence_v2p(div_u, self%u, self%v, self%w) - call self%backend%get_field(u_out, div_u) + call self%backend%get_field_data(u_out, div_u, 3) call self%backend%allocator%release_block(div_u) @@ -671,9 +671,9 @@ subroutine run(self, u_out, v_out, w_out) print*, 'run end' - call self%backend%get_field(u_out, self%u) - call self%backend%get_field(v_out, self%v) - call self%backend%get_field(w_out, self%w) + call self%backend%get_field_data(u_out, self%u, 1) + call self%backend%get_field_data(v_out, self%v, 1) + call self%backend%get_field_data(w_out, self%w, 1) end subroutine run From af20ec25d20abba5c72804f94613c8fabd284eea Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Wed, 20 Mar 2024 18:33:54 +0000 Subject: [PATCH 09/11] Convert initial condition arrays into Cartesian ordering. --- src/solver.f90 | 36 +++++++++++++++--------------------- 1 file changed, 15 insertions(+), 21 deletions(-) diff --git a/src/solver.f90 b/src/solver.f90 index fcf67049..64dd5f45 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -88,7 +88,7 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & integer :: dims(3) real(dp) :: x, y, z - integer :: nx, ny, nz, i, j, b, ka, kb, ix, iy, iz, sz + integer :: nx, ny, nz, i, j, k solver%backend => backend solver%time_integrator => time_integrator @@ -102,7 +102,7 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & solver%w => solver%backend%allocator%get_block(DIR_X) ! Set initial conditions - dims(:) = solver%backend%allocator%xdims_padded(:) + dims(:) = solver%backend%allocator%cdims_padded(:) allocate(u_init(dims(1), dims(2), dims(3))) allocate(v_init(dims(1), dims(2), dims(3))) allocate(w_init(dims(1), dims(2), dims(3))) @@ -112,30 +112,24 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & solver%n_iters = globs%n_iters solver%n_output = globs%n_output - sz = dims(1) nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc - do ka = 1, nz - do kb = 1, ny/sz - do j = 1, nx - do i = 1, sz - ! Mapping to ix, iy, iz depends on global group numbering - ix = j; iy = (kb - 1)*sz + i; iz = ka - x = (ix - 1)*globs%dx - y = (iy - 1)*globs%dy - z = (iz - 1)*globs%dz - - b = ka + (kb - 1)*xdirps%n - u_init(i, j, b) = sin(x)*cos(y)*cos(z) - v_init(i, j, b) =-cos(x)*sin(y)*cos(z) - w_init(i, j, b) = 0 - end do + do k = 1, nz + do j = 1, ny + do i = 1, nx + x = (i - 1)*globs%dx + y = (j - 1)*globs%dy + z = (k - 1)*globs%dz + + u_init(i, j, k) = sin(x)*cos(y)*cos(z) + v_init(i, j, k) = -cos(x)*sin(y)*cos(z) + w_init(i, j, k) = 0 end do end do end do - call solver%backend%set_field_data(solver%u, u_init, 1) - call solver%backend%set_field_data(solver%v, v_init, 1) - call solver%backend%set_field_data(solver%w, w_init, 1) + call solver%backend%set_field_data(solver%u, u_init) + call solver%backend%set_field_data(solver%v, v_init) + call solver%backend%set_field_data(solver%w, w_init) deallocate(u_init, v_init, w_init) print*, 'initial conditions are set' From 224cf5c6887f78ec8e54abf1138158383520008c Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Wed, 20 Mar 2024 18:57:42 +0000 Subject: [PATCH 10/11] Convert output arrays into Cartesian. --- src/solver.f90 | 8 ++++---- src/xcompact.f90 | 8 +++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/solver.f90 b/src/solver.f90 index 64dd5f45..977eb5d9 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -589,7 +589,7 @@ subroutine output(self, t, u_out) div_u => self%backend%allocator%get_block(DIR_Z) call self%divergence_v2p(div_u, self%u, self%v, self%w) - call self%backend%get_field_data(u_out, div_u, 3) + call self%backend%get_field_data(u_out, div_u) call self%backend%allocator%release_block(div_u) @@ -665,9 +665,9 @@ subroutine run(self, u_out, v_out, w_out) print*, 'run end' - call self%backend%get_field_data(u_out, self%u, 1) - call self%backend%get_field_data(v_out, self%v, 1) - call self%backend%get_field_data(w_out, self%w, 1) + call self%backend%get_field_data(u_out, self%u) + call self%backend%get_field_data(v_out, self%v) + call self%backend%get_field_data(w_out, self%w) end subroutine run diff --git a/src/xcompact.f90 b/src/xcompact.f90 index 06bf73c6..7a976a97 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -41,6 +41,7 @@ program xcompact real(dp), allocatable, dimension(:, :, :) :: u, v, w real(dp) :: t_start, t_end + integer :: dims(3) integer :: nrank, nproc, ierr call MPI_Init(ierr) @@ -132,9 +133,10 @@ program xcompact print*, 'OpenMP backend instantiated' #endif - allocate(u(SZ, globs%nx_loc, globs%n_groups_x)) - allocate(v(SZ, globs%nx_loc, globs%n_groups_x)) - allocate(w(SZ, globs%nx_loc, globs%n_groups_x)) + dims(:) = allocator%cdims_padded + allocate (u(dims(1), dims(2), dims(3))) + allocate (v(dims(1), dims(2), dims(3))) + allocate (w(dims(1), dims(2), dims(3))) time_integrator = time_intg_t(allocator=allocator, backend=backend) print*, 'time integrator instantiated' From 955689919f6c6ea793e6917a7c39771c9b1229e4 Mon Sep 17 00:00:00 2001 From: Semih Akkurt Date: Thu, 21 Mar 2024 12:27:20 +0000 Subject: [PATCH 11/11] Apply suggestions from code review Co-authored-by: Jamie J Quinn --- src/backend.f90 | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/backend.f90 b/src/backend.f90 index 73bfd01b..d6ce7839 100644 --- a/src/backend.f90 +++ b/src/backend.f90 @@ -206,12 +206,14 @@ end subroutine init_poisson_fft contains subroutine get_field_data(self, data, f, dir) + !! Extract data from field `f` optionally reordering into `dir` orientation. + !! To output in same orientation as `f`, use `call ...%get_field_data(data, f, f%dir)` implicit none class(base_backend_t) :: self - real(dp), dimension(:, :, :), intent(out) :: data - class(field_t), intent(in) :: f - integer, optional, intent(in) :: dir + real(dp), dimension(:, :, :), intent(out) :: data !! Output array + class(field_t), intent(in) :: f !! Field + integer, optional, intent(in) :: dir !! Desired orientation of output array (defaults to Cartesian) class(field_t), pointer :: f_temp integer :: direction, rdr_dir @@ -241,9 +243,9 @@ subroutine set_field_data(self, f, data, dir) implicit none class(base_backend_t) :: self - class(field_t), intent(inout) :: f - real(dp), dimension(:, :, :), intent(in) :: data - integer, optional, intent(in) :: dir + class(field_t), intent(inout) :: f !! Field + real(dp), dimension(:, :, :), intent(in) :: data !! Input array + integer, optional, intent(in) :: dir !! Orientation of input array (defaults to Cartesian) class(field_t), pointer :: f_temp integer :: direction, rdr_dir