diff --git a/script/indent_continuation.py b/script/indent_continuation.py new file mode 100755 index 00000000..4316fb38 --- /dev/null +++ b/script/indent_continuation.py @@ -0,0 +1,29 @@ +#!/usr/bin/env python3 +# Indent continuation lines for fortran files read through stdin +# usage: +# cat file.f90 | indent_continuation.py +# +import re +import sys + +indent_chars = [ ":", "=", "::" ] +indent_regex = re.compile(r"^\ *") +continuation_regex = re.compile(r'&\ *$') +indent_next = False +current_indent = 0 +parenthesis = 0 + +for line in sys.stdin: + if not indent_next: + for char in indent_chars: + if char in line: + current_indent = line.find(char) + len(char) + 1 + + if indent_next and parenthesis == 0: + line = indent_regex.sub(current_indent*" ", line) + indent_next = continuation_regex.search(line) + print(line, end="") + + parenthesis += line.count("(") - line.count(")")\ + + line.count("[") - line.count("]") + diff --git a/src/allocator.f90 b/src/allocator.f90 index c53de160..c07274f2 100644 --- a/src/allocator.f90 +++ b/src/allocator.f90 @@ -1,37 +1,37 @@ -module m_allocator - use iso_fortran_env, only: stderr => error_unit - - use m_common, only: dp, DIR_X, DIR_Y, DIR_Z, DIR_C - - implicit none - - type :: allocator_t - !! An instance of type allocator_t is responsible for the - !! maintenance of a linked list of instances of equal size - !! [[m_allocator(module):field_t(type)]] objects: - !! - !! ``` - !! ---- ---- ---- ---- ---- ---- - !! ...-->|id=1|data|next|-->|id=0|data|next|-->null() - !! ---- ---- ---- ---- ---- ---- - !! ``` - !! - !! the last block's `next` pointer being non associated. - !! - !! User code can request access to a memory block by using the - !! type bound procedure - !! [[m_allocator(module):get_block(function)]]. If the list is - !! not empty, a pointer to the first block on the list is - !! returned and the block is detached from the list. If the list - !! is empty (i.e. all initially allocated blocks are currently - !! referenced to) then a new block is allocated before a pointer - !! to it is returned. - !! - !! In order to reuse memory it is important that user code - !! release blocks when they are not needed anymore. This is done - !! by calling the type bound procedure - !! [[m_allocator(module):release_block(subroutine)]]. The - !! released block is then pushed in front of the block list. + module m_allocator + use iso_fortran_env, only: stderr => error_unit + + use m_common, only: dp, DIR_X, DIR_Y, DIR_Z, DIR_C + + implicit none + + type :: allocator_t + !! An instance of type allocator_t is responsible for the + !! maintenance of a linked list of instances of equal size + !! [[m_allocator(module):field_t(type)]] objects: + !! + !! ``` + !! ---- ---- ---- ---- ---- ---- + !! ...-->|id=1|data|next|-->|id=0|data|next|-->null() + !! ---- ---- ---- ---- ---- ---- + !! ``` + !! + !! the last block's `next` pointer being non associated. + !! + !! User code can request access to a memory block by using the + !! type bound procedure + !! [[m_allocator(module):get_block(function)]]. If the list is + !! not empty, a pointer to the first block on the list is + !! returned and the block is detached from the list. If the list + !! is empty (i.e. all initially allocated blocks are currently + !! referenced to) then a new block is allocated before a pointer + !! to it is returned. + !! + !! In order to reuse memory it is important that user code + !! release blocks when they are not needed anymore. This is done + !! by calling the type bound procedure + !! [[m_allocator(module):release_block(subroutine)]]. The + !! released block is then pushed in front of the block list. integer :: ngrid, sz !> The id for the next allocated block. This counter is @@ -45,45 +45,45 @@ module m_allocator !> the list is empty ! TODO: Rename first to head class(field_t), pointer :: first => null() - contains + contains procedure :: get_block procedure :: release_block procedure :: create_block procedure :: get_block_ids procedure :: destroy - end type allocator_t + end type allocator_t - interface allocator_t + interface allocator_t module procedure allocator_init - end interface allocator_t - - type :: field_t - !! Memory block type holding both a data field and a pointer - !! to the next block. The `field_t` type also holds a integer - !! `refcount` that counts the number of references to this - !! field. User code is currently responsible for incrementing - !! the reference count. + end interface allocator_t + + type :: field_t + !! Memory block type holding both a data field and a pointer + !! to the next block. The `field_t` type also holds a integer + !! `refcount` that counts the number of references to this + !! field. User code is currently responsible for incrementing + !! the reference count. class(field_t), pointer :: next real(dp), pointer, private :: p_data(:) real(dp), pointer, contiguous :: data(:, :, :) integer :: dir integer :: refcount = 0 integer :: id !! An integer identifying the memory block. - contains + contains procedure :: set_shape - end type field_t + end type field_t - interface field_t + interface field_t module procedure field_init - end interface field_t + end interface field_t - type :: flist_t + type :: flist_t class(field_t), pointer :: ptr - end type flist_t + end type flist_t -contains + contains - function field_init(ngrid, next, id) result(f) + function field_init(ngrid, next, id) result(f) integer, intent(in) :: ngrid, id type(field_t), pointer, intent(in) :: next type(field_t) :: f @@ -92,9 +92,9 @@ function field_init(ngrid, next, id) result(f) f%refcount = 0 f%next => next f%id = id - end function field_init + end function field_init - subroutine set_shape(self, dims) + subroutine set_shape(self, dims) implicit none class(field_t) :: self @@ -102,9 +102,9 @@ subroutine set_shape(self, dims) self%data(1:dims(1), 1:dims(2), 1:dims(3)) => self%p_data - end subroutine set_shape + end subroutine set_shape - function allocator_init(nx, ny, nz, sz) result(allocator) + function allocator_init(nx, ny, nz, sz) result(allocator) integer, intent(in) :: nx, ny, nz, sz type(allocator_t) :: allocator @@ -123,11 +123,11 @@ function allocator_init(nx, ny, nz, sz) result(allocator) allocator%ydims_padded = [sz, ny_padded, nx_padded*nz_padded/sz] allocator%zdims_padded = [sz, nz_padded, nx_padded*ny_padded/sz] allocator%cdims_padded = [nx_padded, ny_padded, nz_padded] - end function allocator_init + end function allocator_init - function create_block(self, next) result(ptr) - !! Allocate memory for a new block and return a pointer to a new - !! [[m_allocator(module):field_t(type)]] object. + function create_block(self, next) result(ptr) + !! Allocate memory for a new block and return a pointer to a new + !! [[m_allocator(module):field_t(type)]] object. class(allocator_t), intent(inout) :: self type(field_t), pointer, intent(in) :: next type(field_t), pointer :: newblock @@ -136,18 +136,18 @@ function create_block(self, next) result(ptr) allocate (newblock) newblock = field_t(self%ngrid, next, id=self%next_id) ptr => newblock - end function create_block - - function get_block(self, dir) result(handle) - !! Return a pointer to the first available memory block, i.e. the - !! current head of the block list. If the list is empty, allocate - !! a new block with [[m_allocator(module):create_block(function)]] - !! first. - !! - !! Example - !! ``` - !! f%data => get_block() - !! ``` + end function create_block + + function get_block(self, dir) result(handle) + !! Return a pointer to the first available memory block, i.e. the + !! current head of the block list. If the list is empty, allocate + !! a new block with [[m_allocator(module):create_block(function)]] + !! first. + !! + !! Example + !! ``` + !! f%data => get_block() + !! ``` class(allocator_t), intent(inout) :: self class(field_t), pointer :: handle integer, intent(in) :: dir @@ -155,9 +155,9 @@ function get_block(self, dir) result(handle) ! If the list is empty, allocate a new block before returning a ! pointer to it. if (.not. associated(self%first)) then - ! Construct a field_t. This effectively allocates - ! storage space. - self%first => self%create_block(next=self%first) + ! Construct a field_t. This effectively allocates + ! storage space. + self%first => self%create_block(next=self%first) end if handle => self%first self%first => self%first%next ! 2nd block becomes head block @@ -168,53 +168,53 @@ function get_block(self, dir) result(handle) ! Set dims based on direction select case(dir) - case (DIR_X) - dims = self%xdims_padded - case (DIR_Y) - dims = self%ydims_padded - case (DIR_Z) - dims = self%zdims_padded - case (DIR_C) - dims = self%cdims_padded - case default - error stop 'Undefined direction, allocator cannot provide a shape.' + case (DIR_X) + dims = self%xdims_padded + case (DIR_Y) + dims = self%ydims_padded + case (DIR_Z) + dims = self%zdims_padded + case (DIR_C) + dims = self%cdims_padded + case default + error stop 'Undefined direction, allocator cannot provide a shape.' end select ! Apply bounds remapping based on requested direction call handle%set_shape(dims) - end function get_block + end function get_block - subroutine release_block(self, handle) - !! Release memory block pointed to by HANDLE to the block list. - !! It is pushed to the front of the block list, in other words it - !! is made the head block. + subroutine release_block(self, handle) + !! Release memory block pointed to by HANDLE to the block list. + !! It is pushed to the front of the block list, in other words it + !! is made the head block. class(allocator_t), intent(inout) :: self class(field_t), pointer :: handle handle%next => self%first self%first => handle - end subroutine release_block - - subroutine destroy(self) - !! Go through the block list from head to tail, deallocating each - !! memory block in turn. Deallocation of a - !! [[m_allocator(module):field_t(type)]] object automatically - !! deallocates its internal allocatable - !! [[field_t(type):data(variable)]] array. + end subroutine release_block + + subroutine destroy(self) + !! Go through the block list from head to tail, deallocating each + !! memory block in turn. Deallocation of a + !! [[m_allocator(module):field_t(type)]] object automatically + !! deallocates its internal allocatable + !! [[field_t(type):data(variable)]] array. class(allocator_t), intent(inout) :: self type(field_t), pointer :: current do - if (.not. associated(self%first)) exit - current => self%first - self%first => self%first%next - deallocate (current) - self%next_id = self%next_id - 1 + if (.not. associated(self%first)) exit + current => self%first + self%first => self%first%next + deallocate (current) + self%next_id = self%next_id - 1 end do - end subroutine destroy + end subroutine destroy - function get_block_ids(self) - !! Utility function that returns a array made of the `id` of the - !! block currently in the block list. Return the array [0] if - !! block list is empty. + function get_block_ids(self) + !! Utility function that returns a array made of the `id` of the + !! block currently in the block list. Return the array [0] if + !! block list is empty. ! TODO: Block indices should start at 1 or return [-1] in case of ! empty block list. class(allocator_t), intent(inout) :: self @@ -224,17 +224,17 @@ function get_block_ids(self) current => self%first if (.not. associated(current)) then - get_block_ids = [0] + get_block_ids = [0] else - i = current%id - get_block_ids = [current%id] - do - if (.not. associated(current%next)) exit - i = current%next%id - get_block_ids = [get_block_ids, current%next%id] - current => current%next - end do + i = current%id + get_block_ids = [current%id] + do + if (.not. associated(current%next)) exit + i = current%next%id + get_block_ids = [get_block_ids, current%next%id] + current => current%next + end do end if - end function get_block_ids + end function get_block_ids -end module m_allocator + end module m_allocator diff --git a/src/backend.f90 b/src/backend.f90 index d6ce7839..1b443d22 100644 --- a/src/backend.f90 +++ b/src/backend.f90 @@ -1,12 +1,12 @@ -module m_base_backend - use m_allocator, only: allocator_t, field_t - use m_common, only: dp, DIR_C, get_rdr_from_dirs - use m_poisson_fft, only: poisson_fft_t - use m_tdsops, only: tdsops_t, dirps_t + module m_base_backend + use m_allocator, only: allocator_t, field_t + use m_common, only: dp, DIR_C, get_rdr_from_dirs + use m_poisson_fft, only: poisson_fft_t + use m_tdsops, only: tdsops_t, dirps_t - implicit none + implicit none - type, abstract :: base_backend_t + type, abstract :: base_backend_t !! base_backend class defines all the abstract operations that the !! solver class requires. !! @@ -26,7 +26,7 @@ module m_base_backend class(allocator_t), pointer :: allocator class(dirps_t), pointer :: xdirps, ydirps, zdirps class(poisson_fft_t), pointer :: poisson_fft - contains + contains procedure(transeq_ders), deferred :: transeq_x procedure(transeq_ders), deferred :: transeq_y procedure(transeq_ders), deferred :: transeq_z @@ -42,172 +42,172 @@ module m_base_backend procedure(init_poisson_fft), deferred :: init_poisson_fft procedure :: get_field_data procedure :: set_field_data - end type base_backend_t + end type base_backend_t - abstract interface + abstract interface subroutine transeq_ders(self, du, dv, dw, u, v, w, dirps) - !! transeq equation obtains the derivatives direction by - !! direction, and the exact algorithm used to obtain these - !! derivatives are decided at runtime. Backend implementations - !! are responsible from directing calls to transeq_ders into - !! the correct algorithm. - import :: base_backend_t - import :: field_t - import :: dirps_t - implicit none - - class(base_backend_t) :: self - class(field_t), intent(inout) :: du, dv, dw - class(field_t), intent(in) :: u, v, w - type(dirps_t), intent(in) :: dirps + !! transeq equation obtains the derivatives direction by + !! direction, and the exact algorithm used to obtain these + !! derivatives are decided at runtime. Backend implementations + !! are responsible from directing calls to transeq_ders into + !! the correct algorithm. + import :: base_backend_t + import :: field_t + import :: dirps_t + implicit none + + class(base_backend_t) :: self + class(field_t), intent(inout) :: du, dv, dw + class(field_t), intent(in) :: u, v, w + type(dirps_t), intent(in) :: dirps end subroutine transeq_ders - end interface + end interface - abstract interface + abstract interface subroutine tds_solve(self, du, u, dirps, tdsops) - !! transeq equation obtains the derivatives direction by - !! direction, and the exact algorithm used to obtain these - !! derivatives are decided at runtime. Backend implementations - !! are responsible from directing calls to transeq_ders into - !! the correct algorithm. - import :: base_backend_t - import :: field_t - import :: dirps_t - import :: tdsops_t - implicit none - - class(base_backend_t) :: self - class(field_t), intent(inout) :: du - class(field_t), intent(in) :: u - type(dirps_t), intent(in) :: dirps - class(tdsops_t), intent(in) :: tdsops + !! transeq equation obtains the derivatives direction by + !! direction, and the exact algorithm used to obtain these + !! derivatives are decided at runtime. Backend implementations + !! are responsible from directing calls to transeq_ders into + !! the correct algorithm. + import :: base_backend_t + import :: field_t + import :: dirps_t + import :: tdsops_t + implicit none + + class(base_backend_t) :: self + class(field_t), intent(inout) :: du + class(field_t), intent(in) :: u + type(dirps_t), intent(in) :: dirps + class(tdsops_t), intent(in) :: tdsops end subroutine tds_solve - end interface + end interface - abstract interface + abstract interface subroutine reorder(self, u_, u, direction) - !! reorder subroutines are straightforward, they rearrange - !! data into our specialist data structure so that regardless - !! of the direction tridiagonal systems are solved efficiently - !! and fast. - import :: base_backend_t - import :: field_t - implicit none - - class(base_backend_t) :: self - class(field_t), intent(inout) :: u_ - class(field_t), intent(in) :: u - integer, intent(in) :: direction + !! reorder subroutines are straightforward, they rearrange + !! data into our specialist data structure so that regardless + !! of the direction tridiagonal systems are solved efficiently + !! and fast. + import :: base_backend_t + import :: field_t + implicit none + + class(base_backend_t) :: self + class(field_t), intent(inout) :: u_ + class(field_t), intent(in) :: u + integer, intent(in) :: direction end subroutine reorder - end interface + end interface - abstract interface + abstract interface subroutine sum_intox(self, u, u_) - !! sum9into3 subroutine combines all the directional velocity - !! derivatives into the corresponding x directional fields. - import :: base_backend_t - import :: field_t - implicit none - - class(base_backend_t) :: self - class(field_t), intent(inout) :: u - class(field_t), intent(in) :: u_ + !! sum9into3 subroutine combines all the directional velocity + !! derivatives into the corresponding x directional fields. + import :: base_backend_t + import :: field_t + implicit none + + class(base_backend_t) :: self + class(field_t), intent(inout) :: u + class(field_t), intent(in) :: u_ end subroutine sum_intox - end interface + end interface - abstract interface + abstract interface subroutine vecadd(self, a, x, b, y) - !! adds two vectors together: y = a*x + b*y - import :: base_backend_t - import :: dp - import :: field_t - implicit none - - class(base_backend_t) :: self - real(dp), intent(in) :: a - class(field_t), intent(in) :: x - real(dp), intent(in) :: b - class(field_t), intent(inout) :: y + !! adds two vectors together: y = a*x + b*y + import :: base_backend_t + import :: dp + import :: field_t + implicit none + + class(base_backend_t) :: self + real(dp), intent(in) :: a + class(field_t), intent(in) :: x + real(dp), intent(in) :: b + class(field_t), intent(inout) :: y end subroutine vecadd - end interface + end interface - abstract interface + abstract interface real(dp) function scalar_product(self, x, y) result(s) - !! Calculates the scalar product of two input fields - import :: base_backend_t - import :: dp - import :: field_t - implicit none - - class(base_backend_t) :: self - class(field_t), intent(in) :: x, y + !! Calculates the scalar product of two input fields + import :: base_backend_t + import :: dp + import :: field_t + implicit none + + class(base_backend_t) :: self + class(field_t), intent(in) :: x, y end function scalar_product - end interface + end interface - abstract interface + abstract interface subroutine copy_data_to_f(self, f, data) - !! Copy the specialist data structure from device or host back - !! to a regular 3D data array in host memory. - import :: base_backend_t - import :: dp - import :: field_t - implicit none - - class(base_backend_t), intent(inout) :: self - class(field_t), intent(inout) :: f - real(dp), dimension(:, :, :), intent(in) :: data + !! Copy the specialist data structure from device or host back + !! to a regular 3D data array in host memory. + import :: base_backend_t + import :: dp + import :: field_t + implicit none + + class(base_backend_t), intent(inout) :: self + class(field_t), intent(inout) :: f + real(dp), dimension(:, :, :), intent(in) :: data end subroutine copy_data_to_f subroutine copy_f_to_data(self, data, f) - !! Copy a regular 3D array in host memory into the specialist - !! data structure field that lives on device or host - import :: base_backend_t - import :: dp - import :: field_t - implicit none - - class(base_backend_t), intent(inout) :: self - real(dp), dimension(:, :, :), intent(out) :: data - class(field_t), intent(in) :: f + !! Copy a regular 3D array in host memory into the specialist + !! data structure field that lives on device or host + import :: base_backend_t + import :: dp + import :: field_t + implicit none + + class(base_backend_t), intent(inout) :: self + real(dp), dimension(:, :, :), intent(out) :: data + class(field_t), intent(in) :: f end subroutine copy_f_to_data - end interface + end interface - abstract interface + abstract interface subroutine alloc_tdsops(self, tdsops, n, dx, operation, scheme, n_halo, & - from_to, bc_start, bc_end, sym, c_nu, nu0_nu) - import :: base_backend_t - import :: dp - import :: tdsops_t - implicit none - - class(base_backend_t) :: self - class(tdsops_t), allocatable, intent(inout) :: tdsops - integer, intent(in) :: n - real(dp), intent(in) :: dx - character(*), intent(in) :: operation, scheme - integer, optional, intent(in) :: n_halo - character(*), optional, intent(in) :: from_to, bc_start, bc_end - logical, optional, intent(in) :: sym - real(dp), optional, intent(in) :: c_nu, nu0_nu + from_to, bc_start, bc_end, sym, c_nu, nu0_nu) + import :: base_backend_t + import :: dp + import :: tdsops_t + implicit none + + class(base_backend_t) :: self + class(tdsops_t), allocatable, intent(inout) :: tdsops + integer, intent(in) :: n + real(dp), intent(in) :: dx + character(*), intent(in) :: operation, scheme + integer, optional, intent(in) :: n_halo + character(*), optional, intent(in) :: from_to, bc_start, bc_end + logical, optional, intent(in) :: sym + real(dp), optional, intent(in) :: c_nu, nu0_nu end subroutine alloc_tdsops - end interface + end interface - abstract interface + abstract interface subroutine init_poisson_fft(self, xdirps, ydirps, zdirps) - import :: base_backend_t - import :: dirps_t - implicit none + import :: base_backend_t + import :: dirps_t + implicit none - class(base_backend_t) :: self - type(dirps_t), intent(in) :: xdirps, ydirps, zdirps + class(base_backend_t) :: self + type(dirps_t), intent(in) :: xdirps, ydirps, zdirps end subroutine init_poisson_fft - end interface + end interface -contains + contains - subroutine get_field_data(self, data, f, dir) - !! Extract data from field `f` optionally reordering into `dir` orientation. - !! To output in same orientation as `f`, use `call ...%get_field_data(data, f, f%dir)` + subroutine get_field_data(self, data, f, dir) + !! Extract data from field `f` optionally reordering into `dir` orientation. + !! To output in same orientation as `f`, use `call ...%get_field_data(data, f, f%dir)` implicit none class(base_backend_t) :: self @@ -219,9 +219,9 @@ subroutine get_field_data(self, data, f, dir) integer :: direction, rdr_dir if (present(dir)) then - direction = dir + direction = dir else - direction = DIR_C + direction = DIR_C end if ! Returns 0 if no reorder required @@ -229,17 +229,17 @@ subroutine get_field_data(self, data, f, dir) ! Carry out a reorder if we need, and copy from field to data array if (rdr_dir /= 0) then - f_temp => self%allocator%get_block(direction) - call self%reorder(f_temp, f, rdr_dir) - call self%copy_f_to_data(data, f_temp) - call self%allocator%release_block(f_temp) + f_temp => self%allocator%get_block(direction) + call self%reorder(f_temp, f, rdr_dir) + call self%copy_f_to_data(data, f_temp) + call self%allocator%release_block(f_temp) else - call self%copy_f_to_data(data, f) + call self%copy_f_to_data(data, f) end if - end subroutine get_field_data + end subroutine get_field_data - subroutine set_field_data(self, f, data, dir) + subroutine set_field_data(self, f, data, dir) implicit none class(base_backend_t) :: self @@ -251,9 +251,9 @@ subroutine set_field_data(self, f, data, dir) integer :: direction, rdr_dir if (present(dir)) then - direction = dir + direction = dir else - direction = DIR_C + direction = DIR_C end if ! Returns 0 if no reorder required @@ -261,14 +261,14 @@ subroutine set_field_data(self, f, data, dir) ! Carry out a reorder if we need, and copy from data array to field if (rdr_dir /= 0) then - f_temp => self%allocator%get_block(direction) - call self%copy_data_to_f(f_temp, data) - call self%reorder(f, f_temp, rdr_dir) - call self%allocator%release_block(f_temp) + f_temp => self%allocator%get_block(direction) + call self%copy_data_to_f(f_temp, data) + call self%reorder(f, f_temp, rdr_dir) + call self%allocator%release_block(f_temp) else - call self%copy_data_to_f(f, data) + call self%copy_data_to_f(f, data) end if - end subroutine set_field_data + end subroutine set_field_data -end module m_base_backend + end module m_base_backend diff --git a/src/common.f90 b/src/common.f90 index a2ceab27..992dd43f 100644 --- a/src/common.f90 +++ b/src/common.f90 @@ -1,23 +1,23 @@ -module m_common - implicit none + module m_common + implicit none - integer, parameter :: dp=kind(0.0d0) - real(dp), parameter :: pi = 4*atan(1.0_dp) + integer, parameter :: dp=kind(0.0d0) + real(dp), parameter :: pi = 4*atan(1.0_dp) - integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, & - RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32, & - RDR_C2X = 41, RDR_C2Y = 42, RDR_C2Z = 43, & - RDR_X2C = 14, RDR_Y2C = 24, RDR_Z2C = 34 - integer, parameter :: DIR_X = 1, DIR_Y = 2, DIR_Z = 3, DIR_C = 4 - integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1 + integer, parameter :: RDR_X2Y = 12, RDR_X2Z = 13, RDR_Y2X = 21, & + RDR_Y2Z = 23, RDR_Z2X = 31, RDR_Z2Y = 32, & + RDR_C2X = 41, RDR_C2Y = 42, RDR_C2Z = 43, & + RDR_X2C = 14, RDR_Y2C = 24, RDR_Z2C = 34 + integer, parameter :: DIR_X = 1, DIR_Y = 2, DIR_Z = 3, DIR_C = 4 + integer, parameter :: POISSON_SOLVER_FFT = 0, POISSON_SOLVER_CG = 1 - integer, protected :: & + integer, protected :: & rdr_map(4, 4) = reshape([0, RDR_X2Y, RDR_X2Z, RDR_X2C, & - RDR_Y2X, 0, RDR_Y2Z, RDR_Y2C, & - RDR_Z2X, RDR_Z2Y, 0, RDR_Z2C, & - RDR_C2X, RDR_C2Y, RDR_C2Z, 0], shape=[4, 4]) + RDR_Y2X, 0, RDR_Y2Z, RDR_Y2C, & + RDR_Z2X, RDR_Z2Y, 0, RDR_Z2C, & + RDR_C2X, RDR_C2Y, RDR_C2Z, 0], shape=[4, 4]) - type :: globs_t + type :: globs_t integer :: nx, ny, nz integer :: nx_loc, ny_loc, nz_loc integer :: n_groups_x, n_groups_y, n_groups_z @@ -28,12 +28,12 @@ module m_common integer :: nproc_x = 1, nproc_y = 1, nproc_z = 1 character(len=20) :: BC_x_s, BC_x_e, BC_y_s, BC_y_e, BC_z_s, BC_z_e integer :: poisson_solver_type - end type globs_t + end type globs_t -contains + contains - subroutine set_pprev_pnext(xprev, xnext, yprev, ynext, zprev, znext, & - xnproc, ynproc, znproc, nrank) + subroutine set_pprev_pnext(xprev, xnext, yprev, ynext, zprev, znext, & + xnproc, ynproc, znproc, nrank) implicit none integer, intent(out) :: xprev, xnext, yprev, ynext, zprev, znext @@ -48,50 +48,50 @@ subroutine set_pprev_pnext(xprev, xnext, yprev, ynext, zprev, znext, & ! prev and next in x direction if (ix == 0) then - xprev = nrank + (xnproc - 1) + xprev = nrank + (xnproc - 1) else - xprev = nrank - 1 + xprev = nrank - 1 end if if (ix == xnproc - 1) then - xnext = nrank - (xnproc - 1) + xnext = nrank - (xnproc - 1) else - xnext = nrank + 1 + xnext = nrank + 1 end if ! prev and next in y direction if (iy == 0) then - yprev = nrank + (xnproc*(ynproc - 1)) + yprev = nrank + (xnproc*(ynproc - 1)) else - yprev = nrank - xnproc + yprev = nrank - xnproc end if if (iy == ynproc - 1) then - ynext = nrank - (xnproc*(ynproc - 1)) + ynext = nrank - (xnproc*(ynproc - 1)) else - ynext = nrank + xnproc + ynext = nrank + xnproc end if ! prev and next in z direction if (iz == 0) then - zprev = nrank + (xnproc*ynproc*(znproc - 1)) + zprev = nrank + (xnproc*ynproc*(znproc - 1)) else - zprev = nrank - xnproc*ynproc + zprev = nrank - xnproc*ynproc end if if (iz == znproc - 1) then - znext = nrank - (xnproc*ynproc*(znproc - 1)) + znext = nrank - (xnproc*ynproc*(znproc - 1)) else - znext = nrank + xnproc*ynproc + znext = nrank + xnproc*ynproc end if - end subroutine set_pprev_pnext + end subroutine set_pprev_pnext - integer function get_rdr_from_dirs(dir_from, dir_to) result(rdr_dir) + integer function get_rdr_from_dirs(dir_from, dir_to) result(rdr_dir) !! Returns RDR_?2? value based on two direction inputs integer, intent(in) :: dir_from, dir_to rdr_dir = rdr_map(dir_from, dir_to) - end function get_rdr_from_dirs + end function get_rdr_from_dirs -end module m_common + end module m_common diff --git a/src/cuda/allocator.f90 b/src/cuda/allocator.f90 index 81a817f9..b3c876b9 100644 --- a/src/cuda/allocator.f90 +++ b/src/cuda/allocator.f90 @@ -1,32 +1,32 @@ -module m_cuda_allocator - use m_allocator, only: allocator_t, field_t - use m_common, only: dp + module m_cuda_allocator + use m_allocator, only: allocator_t, field_t + use m_common, only: dp - implicit none + implicit none - type, extends(allocator_t) :: cuda_allocator_t - contains + type, extends(allocator_t) :: cuda_allocator_t + contains procedure :: create_block => create_cuda_block - end type cuda_allocator_t + end type cuda_allocator_t - interface cuda_allocator_t + interface cuda_allocator_t module procedure cuda_allocator_init - end interface cuda_allocator_t + end interface cuda_allocator_t - type, extends(field_t) :: cuda_field_t + type, extends(field_t) :: cuda_field_t real(dp), device, pointer, private :: p_data_d(:) real(dp), device, pointer, contiguous :: data_d(:, :, :) - contains + contains procedure :: set_shape => set_shape_cuda - end type cuda_field_t + end type cuda_field_t - interface cuda_field_t + interface cuda_field_t module procedure cuda_field_init - end interface cuda_field_t + end interface cuda_field_t -contains + contains - function cuda_field_init(ngrid, next, id) result(f) + function cuda_field_init(ngrid, next, id) result(f) integer, intent(in) :: ngrid, id type(cuda_field_t), pointer, intent(in) :: next type(cuda_field_t) :: f @@ -35,9 +35,9 @@ function cuda_field_init(ngrid, next, id) result(f) f%refcount = 0 f%next => next f%id = id - end function cuda_field_init + end function cuda_field_init - subroutine set_shape_cuda(self, dims) + subroutine set_shape_cuda(self, dims) implicit none class(cuda_field_t) :: self @@ -45,16 +45,16 @@ subroutine set_shape_cuda(self, dims) self%data_d(1:dims(1), 1:dims(2), 1:dims(3)) => self%p_data_d - end subroutine set_shape_cuda + end subroutine set_shape_cuda - function cuda_allocator_init(nx, ny, nz, sz) result(allocator) + function cuda_allocator_init(nx, ny, nz, sz) result(allocator) integer, intent(in) :: nx, ny, nz, sz type(cuda_allocator_t) :: allocator allocator%allocator_t = allocator_t(nx, ny, nz, sz) - end function cuda_allocator_init + end function cuda_allocator_init - function create_cuda_block(self, next) result(ptr) + function create_cuda_block(self, next) result(ptr) class(cuda_allocator_t), intent(inout) :: self type(cuda_field_t), pointer, intent(in) :: next type(cuda_field_t), pointer :: newblock @@ -63,6 +63,6 @@ function create_cuda_block(self, next) result(ptr) self%next_id = self%next_id + 1 newblock = cuda_field_t(self%ngrid, next, id=self%next_id) ptr => newblock - end function create_cuda_block + end function create_cuda_block -end module m_cuda_allocator + end module m_cuda_allocator diff --git a/src/cuda/backend.f90 b/src/cuda/backend.f90 index ff7f6dad..a2360fd5 100644 --- a/src/cuda/backend.f90 +++ b/src/cuda/backend.f90 @@ -1,43 +1,43 @@ -module m_cuda_backend - use iso_fortran_env, only: stderr => error_unit - use cudafor - use mpi - - use m_allocator, only: allocator_t, field_t - use m_base_backend, only: base_backend_t - use m_common, only: dp, globs_t, & - RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, & - RDR_C2X, RDR_C2Y, RDR_C2Z, RDR_X2C, RDR_Y2C, RDR_Z2C, & - DIR_X, DIR_Y, DIR_Z, DIR_C - use m_poisson_fft, only: poisson_fft_t - use m_tdsops, only: dirps_t, tdsops_t - - use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t - use m_cuda_common, only: SZ - use m_cuda_exec_dist, only: exec_dist_transeq_3fused, exec_dist_tds_compact - use m_cuda_poisson_fft, only: cuda_poisson_fft_t - use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields - use m_cuda_tdsops, only: cuda_tdsops_t - use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs - use m_cuda_kernels_reorder, only: & - reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2x, & - reorder_z2y, reorder_c2x, reorder_x2c, & - sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy - - implicit none - - private :: transeq_halo_exchange, transeq_dist_component - - type, extends(base_backend_t) :: cuda_backend_t + module m_cuda_backend + use iso_fortran_env, only: stderr => error_unit + use cudafor + use mpi + + use m_allocator, only: allocator_t, field_t + use m_base_backend, only: base_backend_t + use m_common, only: dp, globs_t, & + RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, & + RDR_C2X, RDR_C2Y, RDR_C2Z, RDR_X2C, RDR_Y2C, RDR_Z2C, & + DIR_X, DIR_Y, DIR_Z, DIR_C + use m_poisson_fft, only: poisson_fft_t + use m_tdsops, only: dirps_t, tdsops_t + + use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t + use m_cuda_common, only: SZ + use m_cuda_exec_dist, only: exec_dist_transeq_3fused, exec_dist_tds_compact + use m_cuda_poisson_fft, only: cuda_poisson_fft_t + use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields + use m_cuda_tdsops, only: cuda_tdsops_t + use m_cuda_kernels_dist, only: transeq_3fused_dist, transeq_3fused_subs + use m_cuda_kernels_reorder, only: & + reorder_x2y, reorder_x2z, reorder_y2x, reorder_y2z, reorder_z2x, & + reorder_z2y, reorder_c2x, reorder_x2c, & + sum_yintox, sum_zintox, scalar_product, axpby, buffer_copy + + implicit none + + private :: transeq_halo_exchange, transeq_dist_component + + type, extends(base_backend_t) :: cuda_backend_t !character(len=*), parameter :: name = 'cuda' integer :: MPI_FP_PREC = dp real(dp), device, allocatable, dimension(:, :, :) :: & - u_recv_s_dev, u_recv_e_dev, u_send_s_dev, u_send_e_dev, & - v_recv_s_dev, v_recv_e_dev, v_send_s_dev, v_send_e_dev, & - w_recv_s_dev, w_recv_e_dev, w_send_s_dev, w_send_e_dev, & - du_send_s_dev, du_send_e_dev, du_recv_s_dev, du_recv_e_dev, & - dud_send_s_dev, dud_send_e_dev, dud_recv_s_dev, dud_recv_e_dev, & - d2u_send_s_dev, d2u_send_e_dev, d2u_recv_s_dev, d2u_recv_e_dev + u_recv_s_dev, u_recv_e_dev, u_send_s_dev, u_send_e_dev, & + v_recv_s_dev, v_recv_e_dev, v_send_s_dev, v_send_e_dev, & + w_recv_s_dev, w_recv_e_dev, w_send_s_dev, w_send_e_dev, & + du_send_s_dev, du_send_e_dev, du_recv_s_dev, du_recv_e_dev, & + dud_send_s_dev, dud_send_e_dev, dud_recv_s_dev, dud_recv_e_dev, & + d2u_send_s_dev, d2u_send_e_dev, d2u_recv_s_dev, d2u_recv_e_dev type(dim3) :: xblocks, xthreads, yblocks, ythreads, zblocks, zthreads contains procedure :: alloc_tdsops => alloc_cuda_tdsops @@ -56,15 +56,15 @@ module m_cuda_backend procedure :: transeq_cuda_dist procedure :: transeq_cuda_thom procedure :: tds_solve_dist - end type cuda_backend_t + end type cuda_backend_t - interface cuda_backend_t + interface cuda_backend_t module procedure init - end interface cuda_backend_t + end interface cuda_backend_t - contains + contains - function init(globs, allocator) result(backend) + function init(globs, allocator) result(backend) implicit none class(globs_t) :: globs @@ -75,9 +75,9 @@ function init(globs, allocator) result(backend) integer :: n_halo, n_block select type(allocator) - type is (cuda_allocator_t) - ! class level access to the allocator - backend%allocator => allocator + type is (cuda_allocator_t) + ! class level access to the allocator + backend%allocator => allocator end select backend%xthreads = dim3(SZ, 1, 1) @@ -121,9 +121,9 @@ function init(globs, allocator) result(backend) allocate(backend%d2u_recv_s_dev(SZ, 1, n_block)) allocate(backend%d2u_recv_e_dev(SZ, 1, n_block)) - end function init + end function init - subroutine alloc_cuda_tdsops( & + subroutine alloc_cuda_tdsops( & self, tdsops, n, dx, operation, scheme, & n_halo, from_to, bc_start, bc_end, sym, c_nu, nu0_nu & ) @@ -142,14 +142,14 @@ subroutine alloc_cuda_tdsops( & allocate(cuda_tdsops_t :: tdsops) select type (tdsops) - type is (cuda_tdsops_t) - tdsops = cuda_tdsops_t(n, dx, operation, scheme, n_halo, from_to, & - bc_start, bc_end, sym, c_nu, nu0_nu) + type is (cuda_tdsops_t) + tdsops = cuda_tdsops_t(n, dx, operation, scheme, n_halo, from_to, & + bc_start, bc_end, sym, c_nu, nu0_nu) end select - end subroutine alloc_cuda_tdsops + end subroutine alloc_cuda_tdsops - subroutine transeq_x_cuda(self, du, dv, dw, u, v, w, dirps) + subroutine transeq_x_cuda(self, du, dv, dw, u, v, w, dirps) implicit none class(cuda_backend_t) :: self @@ -158,11 +158,11 @@ subroutine transeq_x_cuda(self, du, dv, dw, u, v, w, dirps) type(dirps_t), intent(in) :: dirps call self%transeq_cuda_dist(du, dv, dw, u, v, w, dirps, & - self%xblocks, self%xthreads) + self%xblocks, self%xthreads) - end subroutine transeq_x_cuda + end subroutine transeq_x_cuda - subroutine transeq_y_cuda(self, du, dv, dw, u, v, w, dirps) + subroutine transeq_y_cuda(self, du, dv, dw, u, v, w, dirps) implicit none class(cuda_backend_t) :: self @@ -172,11 +172,11 @@ subroutine transeq_y_cuda(self, du, dv, dw, u, v, w, dirps) ! u, v, w is reordered so that we pass v, u, w call self%transeq_cuda_dist(dv, du, dw, v, u, w, dirps, & - self%yblocks, self%ythreads) + self%yblocks, self%ythreads) - end subroutine transeq_y_cuda + end subroutine transeq_y_cuda - subroutine transeq_z_cuda(self, du, dv, dw, u, v, w, dirps) + subroutine transeq_z_cuda(self, du, dv, dw, u, v, w, dirps) implicit none class(cuda_backend_t) :: self @@ -186,12 +186,12 @@ subroutine transeq_z_cuda(self, du, dv, dw, u, v, w, dirps) ! u, v, w is reordered so that we pass w, u, v call self%transeq_cuda_dist(dw, du, dv, w, u, v, dirps, & - self%zblocks, self%zthreads) + self%zblocks, self%zthreads) - end subroutine transeq_z_cuda + end subroutine transeq_z_cuda - subroutine transeq_cuda_dist(self, du, dv, dw, u, v, w, dirps, & - blocks, threads) + subroutine transeq_cuda_dist(self, du, dv, dw, u, v, w, dirps, & + blocks, threads) implicit none class(cuda_backend_t) :: self @@ -201,7 +201,7 @@ subroutine transeq_cuda_dist(self, du, dv, dw, u, v, w, dirps, & type(dim3), intent(in) :: blocks, threads real(dp), device, pointer, dimension(:, :, :) :: u_dev, v_dev, w_dev, & - du_dev, dv_dev, dw_dev + du_dev, dv_dev, dw_dev type(cuda_tdsops_t), pointer :: der1st, der1st_sym, der2nd, der2nd_sym @@ -214,39 +214,39 @@ subroutine transeq_cuda_dist(self, du, dv, dw, u, v, w, dirps, & call resolve_field_t(dw_dev, dw) select type (tdsops => dirps%der1st) - type is (cuda_tdsops_t); der1st => tdsops + type is (cuda_tdsops_t); der1st => tdsops end select select type (tdsops => dirps%der1st_sym) - type is (cuda_tdsops_t); der1st_sym => tdsops + type is (cuda_tdsops_t); der1st_sym => tdsops end select select type (tdsops => dirps%der2nd) - type is (cuda_tdsops_t); der2nd => tdsops + type is (cuda_tdsops_t); der2nd => tdsops end select select type (tdsops => dirps%der2nd_sym) - type is (cuda_tdsops_t); der2nd_sym => tdsops + type is (cuda_tdsops_t); der2nd_sym => tdsops end select call transeq_halo_exchange(self, u_dev, v_dev, w_dev, dirps) call transeq_dist_component(self, du_dev, u_dev, u_dev, & - self%u_recv_s_dev, self%u_recv_e_dev, & - self%u_recv_s_dev, self%u_recv_e_dev, & - der1st, der1st_sym, der2nd, dirps, & - blocks, threads) + self%u_recv_s_dev, self%u_recv_e_dev, & + self%u_recv_s_dev, self%u_recv_e_dev, & + der1st, der1st_sym, der2nd, dirps, & + blocks, threads) call transeq_dist_component(self, dv_dev, v_dev, u_dev, & - self%v_recv_s_dev, self%v_recv_e_dev, & - self%u_recv_s_dev, self%u_recv_e_dev, & - der1st_sym, der1st, der2nd_sym, dirps, & - blocks, threads) + self%v_recv_s_dev, self%v_recv_e_dev, & + self%u_recv_s_dev, self%u_recv_e_dev, & + der1st_sym, der1st, der2nd_sym, dirps, & + blocks, threads) call transeq_dist_component(self, dw_dev, w_dev, u_dev, & - self%w_recv_s_dev, self%w_recv_e_dev, & - self%u_recv_s_dev, self%u_recv_e_dev, & - der1st_sym, der1st, der2nd_sym, dirps, & - blocks, threads) + self%w_recv_s_dev, self%w_recv_e_dev, & + self%u_recv_s_dev, self%u_recv_e_dev, & + der1st_sym, der1st, der2nd_sym, dirps, & + blocks, threads) - end subroutine transeq_cuda_dist + end subroutine transeq_cuda_dist - subroutine transeq_halo_exchange(self, u_dev, v_dev, w_dev, dirps) + subroutine transeq_halo_exchange(self, u_dev, v_dev, w_dev, dirps) class(cuda_backend_t) :: self real(dp), device, dimension(:, :, :), intent(in) :: u_dev, v_dev, w_dev type(dirps_t), intent(in) :: dirps @@ -257,30 +257,30 @@ subroutine transeq_halo_exchange(self, u_dev, v_dev, w_dev, dirps) ! Copy halo data into buffer arrays call copy_into_buffers(self%u_send_s_dev, self%u_send_e_dev, u_dev, & - dirps%n) + dirps%n) call copy_into_buffers(self%v_send_s_dev, self%v_send_e_dev, v_dev, & - dirps%n) + dirps%n) call copy_into_buffers(self%w_send_s_dev, self%w_send_e_dev, w_dev, & - dirps%n) + dirps%n) ! halo exchange call sendrecv_3fields( & - self%u_recv_s_dev, self%u_recv_e_dev, & - self%v_recv_s_dev, self%v_recv_e_dev, & - self%w_recv_s_dev, self%w_recv_e_dev, & - self%u_send_s_dev, self%u_send_e_dev, & - self%v_send_s_dev, self%v_send_e_dev, & - self%w_send_s_dev, self%w_send_e_dev, & - SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext & - ) - - end subroutine transeq_halo_exchange - - subroutine transeq_dist_component(self, rhs_dev, u_dev, conv_dev, & - u_recv_s_dev, u_recv_e_dev, & - conv_recv_s_dev, conv_recv_e_dev, & - tdsops_du, tdsops_dud, tdsops_d2u, & - dirps, blocks, threads) + self%u_recv_s_dev, self%u_recv_e_dev, & + self%v_recv_s_dev, self%v_recv_e_dev, & + self%w_recv_s_dev, self%w_recv_e_dev, & + self%u_send_s_dev, self%u_send_e_dev, & + self%v_send_s_dev, self%v_send_e_dev, & + self%w_send_s_dev, self%w_send_e_dev, & + SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext & + ) + + end subroutine transeq_halo_exchange + + subroutine transeq_dist_component(self, rhs_dev, u_dev, conv_dev, & + u_recv_s_dev, u_recv_e_dev, & + conv_recv_s_dev, conv_recv_e_dev, & + tdsops_du, tdsops_dud, tdsops_d2u, & + dirps, blocks, threads) !! Computes RHS_x^u following: !! !! rhs_x^u = -0.5*(conv*du/dx + d(u*conv)/dx) + nu*d2u/dx2 @@ -288,8 +288,8 @@ subroutine transeq_dist_component(self, rhs_dev, u_dev, conv_dev, & real(dp), device, dimension(:, :, :), intent(inout) :: rhs_dev real(dp), device, dimension(:, :, :), intent(in) :: u_dev, conv_dev real(dp), device, dimension(:, :, :), intent(in) :: & - u_recv_s_dev, u_recv_e_dev, & - conv_recv_s_dev, conv_recv_e_dev + u_recv_s_dev, u_recv_e_dev, & + conv_recv_s_dev, conv_recv_e_dev class(cuda_tdsops_t), intent(in) :: tdsops_du, tdsops_dud, tdsops_d2u type(dirps_t), intent(in) :: dirps type(dim3), intent(in) :: blocks, threads @@ -297,7 +297,7 @@ subroutine transeq_dist_component(self, rhs_dev, u_dev, conv_dev, & class(field_t), pointer :: du, dud, d2u real(dp), device, pointer, dimension(:, :, :) :: & - du_dev, dud_dev, d2u_dev + du_dev, dud_dev, d2u_dev ! Get some fields for storing the intermediate results du => self%allocator%get_block(dirps%dir) @@ -309,29 +309,29 @@ subroutine transeq_dist_component(self, rhs_dev, u_dev, conv_dev, & call resolve_field_t(d2u_dev, d2u) call exec_dist_transeq_3fused( & - rhs_dev, & - u_dev, u_recv_s_dev, u_recv_e_dev, & - conv_dev, conv_recv_s_dev, conv_recv_e_dev, & - du_dev, dud_dev, d2u_dev, & - self%du_send_s_dev, self%du_send_e_dev, & - self%du_recv_s_dev, self%du_recv_e_dev, & - self%dud_send_s_dev, self%dud_send_e_dev, & - self%dud_recv_s_dev, self%dud_recv_e_dev, & - self%d2u_send_s_dev, self%d2u_send_e_dev, & - self%d2u_recv_s_dev, self%d2u_recv_e_dev, & - tdsops_du, tdsops_d2u, self%nu, & - dirps%nproc, dirps%pprev, dirps%pnext, & - blocks, threads & - ) + rhs_dev, & + u_dev, u_recv_s_dev, u_recv_e_dev, & + conv_dev, conv_recv_s_dev, conv_recv_e_dev, & + du_dev, dud_dev, d2u_dev, & + self%du_send_s_dev, self%du_send_e_dev, & + self%du_recv_s_dev, self%du_recv_e_dev, & + self%dud_send_s_dev, self%dud_send_e_dev, & + self%dud_recv_s_dev, self%dud_recv_e_dev, & + self%d2u_send_s_dev, self%d2u_send_e_dev, & + self%d2u_recv_s_dev, self%d2u_recv_e_dev, & + tdsops_du, tdsops_d2u, self%nu, & + dirps%nproc, dirps%pprev, dirps%pnext, & + blocks, threads & + ) ! Release temporary blocks call self%allocator%release_block(du) call self%allocator%release_block(dud) call self%allocator%release_block(d2u) - end subroutine transeq_dist_component + end subroutine transeq_dist_component - subroutine transeq_cuda_thom(self, du, dv, dw, u, v, w, dirps) + subroutine transeq_cuda_thom(self, du, dv, dw, u, v, w, dirps) !! Thomas algorithm implementation. So much more easier than the !! distributed algorithm. It is intended to work only on a single rank !! so there is no MPI communication. @@ -342,9 +342,9 @@ subroutine transeq_cuda_thom(self, du, dv, dw, u, v, w, dirps) class(field_t), intent(in) :: u, v, w type(dirps_t), intent(in) :: dirps - end subroutine transeq_cuda_thom + end subroutine transeq_cuda_thom - subroutine tds_solve_cuda(self, du, u, dirps, tdsops) + subroutine tds_solve_cuda(self, du, u, dirps, tdsops) implicit none class(cuda_backend_t) :: self @@ -357,16 +357,16 @@ subroutine tds_solve_cuda(self, du, u, dirps, tdsops) ! Check if direction matches for both in/out fields and dirps if (dirps%dir /= du%dir .or. u%dir /= du%dir) then - error stop 'DIR mismatch between fields and dirps in tds_solve.' + error stop 'DIR mismatch between fields and dirps in tds_solve.' end if blocks = dim3(dirps%n_blocks, 1, 1); threads = dim3(SZ, 1, 1) call tds_solve_dist(self, du, u, dirps, tdsops, blocks, threads) - end subroutine tds_solve_cuda + end subroutine tds_solve_cuda - subroutine tds_solve_dist(self, du, u, dirps, tdsops, blocks, threads) + subroutine tds_solve_dist(self, du, u, dirps, tdsops, blocks, threads) implicit none class(cuda_backend_t) :: self @@ -389,30 +389,30 @@ subroutine tds_solve_dist(self, du, u, dirps, tdsops, blocks, threads) call resolve_field_t(u_dev, u) select type (tdsops) - type is (cuda_tdsops_t); tdsops_dev => tdsops + type is (cuda_tdsops_t); tdsops_dev => tdsops end select call copy_into_buffers(self%u_send_s_dev, self%u_send_e_dev, u_dev, & - tdsops_dev%n) + tdsops_dev%n) call sendrecv_fields(self%u_recv_s_dev, self%u_recv_e_dev, & - self%u_send_s_dev, self%u_send_e_dev, & - SZ*n_halo*dirps%n_blocks, dirps%nproc, & - dirps%pprev, dirps%pnext) + self%u_send_s_dev, self%u_send_e_dev, & + SZ*n_halo*dirps%n_blocks, dirps%nproc, & + dirps%pprev, dirps%pnext) ! call exec_dist call exec_dist_tds_compact( & - du_dev, u_dev, & - self%u_recv_s_dev, self%u_recv_e_dev, & - self%du_send_s_dev, self%du_send_e_dev, & - self%du_recv_s_dev, self%du_recv_e_dev, & - tdsops_dev, dirps%nproc, dirps%pprev, dirps%pnext, & - blocks, threads & - ) + du_dev, u_dev, & + self%u_recv_s_dev, self%u_recv_e_dev, & + self%du_send_s_dev, self%du_send_e_dev, & + self%du_recv_s_dev, self%du_recv_e_dev, & + tdsops_dev, dirps%nproc, dirps%pprev, dirps%pnext, & + blocks, threads & + ) - end subroutine tds_solve_dist + end subroutine tds_solve_dist - subroutine reorder_cuda(self, u_o, u_i, direction) + subroutine reorder_cuda(self, u_o, u_i, direction) implicit none class(cuda_backend_t) :: self @@ -428,101 +428,101 @@ subroutine reorder_cuda(self, u_o, u_i, direction) call resolve_field_t(u_i_d, u_i) select case (direction) - case (RDR_X2Y) - blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ) - threads = dim3(SZ, SZ, 1) - call reorder_x2y<<>>(u_o_d, u_i_d, self%nz_loc) - case (RDR_X2Z) - blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) - threads = dim3(SZ, 1, 1) - call reorder_x2z<<>>(u_o_d, u_i_d, self%nz_loc) - case (RDR_Y2X) - blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) - threads = dim3(SZ, SZ, 1) - call reorder_y2x<<>>(u_o_d, u_i_d, self%nz_loc) - case (RDR_Y2Z) - blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) - threads = dim3(SZ, SZ, 1) - call reorder_y2z<<>>(u_o_d, u_i_d, & - self%nx_loc, self%nz_loc) - case (RDR_Z2X) - blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) - threads = dim3(SZ, 1, 1) - call reorder_z2x<<>>(u_o_d, u_i_d, self%nz_loc) - case (RDR_Z2Y) - blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) - threads = dim3(SZ, SZ, 1) - call reorder_z2y<<>>(u_o_d, u_i_d, & - self%nx_loc, self%nz_loc) - case (RDR_C2X) - blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) - threads = dim3(SZ, SZ, 1) - call reorder_c2x<<>>(u_o_d, u_i_d, self%nz_loc) - case (RDR_C2Y) - ! First reorder from C to X, then from X to Y - u_temp => self%allocator%get_block(DIR_X) - call resolve_field_t(u_temp_d, u_temp) - - blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) - threads = dim3(SZ, SZ, 1) - call reorder_c2x<<>>(u_temp_d, u_i_d, self%nz_loc) - - blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ) - threads = dim3(SZ, SZ, 1) - call reorder_x2y<<>>(u_o_d, u_temp_d, self%nz_loc) - - call self%allocator%release_block(u_temp) - case (RDR_C2Z) - ! First reorder from C to X, then from X to Z - u_temp => self%allocator%get_block(DIR_X) - call resolve_field_t(u_temp_d, u_temp) - - blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) - threads = dim3(SZ, SZ, 1) - call reorder_c2x<<>>(u_temp_d, u_i_d, self%nz_loc) - - blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) - threads = dim3(SZ, 1, 1) - call reorder_x2z<<>>(u_o_d, u_temp_d, self%nz_loc) - - call self%allocator%release_block(u_temp) - case (RDR_X2C) - blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) - threads = dim3(SZ, SZ, 1) - call reorder_x2c<<>>(u_o_d, u_i_d, self%nz_loc) - case (RDR_Y2C) - ! First reorder from Y to X, then from X to C - u_temp => self%allocator%get_block(DIR_X) - call resolve_field_t(u_temp_d, u_temp) - - blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) - threads = dim3(SZ, SZ, 1) - call reorder_y2x<<>>(u_temp_d, u_i_d, self%nz_loc) - - call reorder_x2c<<>>(u_o_d, u_temp_d, self%nz_loc) - - call self%allocator%release_block(u_temp) - case (RDR_Z2C) - ! First reorder from Z to X, then from X to C - u_temp => self%allocator%get_block(DIR_X) - call resolve_field_t(u_temp_d, u_temp) - - blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) - threads = dim3(SZ, 1, 1) - call reorder_z2x<<>>(u_temp_d, u_i_d, self%nz_loc) - - blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) - threads = dim3(SZ, SZ, 1) - call reorder_x2c<<>>(u_o_d, u_temp_d, self%nz_loc) - - call self%allocator%release_block(u_temp) - case default - error stop 'Reorder direction is undefined.' + case (RDR_X2Y) + blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ) + threads = dim3(SZ, SZ, 1) + call reorder_x2y<<>>(u_o_d, u_i_d, self%nz_loc) + case (RDR_X2Z) + blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_x2z<<>>(u_o_d, u_i_d, self%nz_loc) + case (RDR_Y2X) + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call reorder_y2x<<>>(u_o_d, u_i_d, self%nz_loc) + case (RDR_Y2Z) + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call reorder_y2z<<>>(u_o_d, u_i_d, & + self%nx_loc, self%nz_loc) + case (RDR_Z2X) + blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_z2x<<>>(u_o_d, u_i_d, self%nz_loc) + case (RDR_Z2Y) + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call reorder_z2y<<>>(u_o_d, u_i_d, & + self%nx_loc, self%nz_loc) + case (RDR_C2X) + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call reorder_c2x<<>>(u_o_d, u_i_d, self%nz_loc) + case (RDR_C2Y) + ! First reorder from C to X, then from X to Y + u_temp => self%allocator%get_block(DIR_X) + call resolve_field_t(u_temp_d, u_temp) + + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call reorder_c2x<<>>(u_temp_d, u_i_d, self%nz_loc) + + blocks = dim3(self%nx_loc/SZ, self%nz_loc, self%ny_loc/SZ) + threads = dim3(SZ, SZ, 1) + call reorder_x2y<<>>(u_o_d, u_temp_d, self%nz_loc) + + call self%allocator%release_block(u_temp) + case (RDR_C2Z) + ! First reorder from C to X, then from X to Z + u_temp => self%allocator%get_block(DIR_X) + call resolve_field_t(u_temp_d, u_temp) + + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call reorder_c2x<<>>(u_temp_d, u_i_d, self%nz_loc) + + blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_x2z<<>>(u_o_d, u_temp_d, self%nz_loc) + + call self%allocator%release_block(u_temp) + case (RDR_X2C) + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call reorder_x2c<<>>(u_o_d, u_i_d, self%nz_loc) + case (RDR_Y2C) + ! First reorder from Y to X, then from X to C + u_temp => self%allocator%get_block(DIR_X) + call resolve_field_t(u_temp_d, u_temp) + + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call reorder_y2x<<>>(u_temp_d, u_i_d, self%nz_loc) + + call reorder_x2c<<>>(u_o_d, u_temp_d, self%nz_loc) + + call self%allocator%release_block(u_temp) + case (RDR_Z2C) + ! First reorder from Z to X, then from X to C + u_temp => self%allocator%get_block(DIR_X) + call resolve_field_t(u_temp_d, u_temp) + + blocks = dim3(self%nx_loc, self%ny_loc/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_z2x<<>>(u_temp_d, u_i_d, self%nz_loc) + + blocks = dim3(self%nx_loc/SZ, self%ny_loc/SZ, self%nz_loc) + threads = dim3(SZ, SZ, 1) + call reorder_x2c<<>>(u_o_d, u_temp_d, self%nz_loc) + + call self%allocator%release_block(u_temp) + case default + error stop 'Reorder direction is undefined.' end select - end subroutine reorder_cuda + end subroutine reorder_cuda - subroutine sum_yintox_cuda(self, u, u_y) + subroutine sum_yintox_cuda(self, u, u_y) implicit none class(cuda_backend_t) :: self @@ -539,9 +539,9 @@ subroutine sum_yintox_cuda(self, u, u_y) threads = dim3(SZ, SZ, 1) call sum_yintox<<>>(u_d, u_y_d, self%nz_loc) - end subroutine sum_yintox_cuda + end subroutine sum_yintox_cuda - subroutine sum_zintox_cuda(self, u, u_z) + subroutine sum_zintox_cuda(self, u, u_z) implicit none class(cuda_backend_t) :: self @@ -558,9 +558,9 @@ subroutine sum_zintox_cuda(self, u, u_z) threads = dim3(SZ, 1, 1) call sum_zintox<<>>(u_d, u_z_d, self%nz_loc) - end subroutine sum_zintox_cuda + end subroutine sum_zintox_cuda - subroutine vecadd_cuda(self, a, x, b, y) + subroutine vecadd_cuda(self, a, x, b, y) implicit none class(cuda_backend_t) :: self @@ -581,9 +581,9 @@ subroutine vecadd_cuda(self, a, x, b, y) threads = dim3(SZ, 1, 1) call axpby<<>>(nx, a, x_d, b, y_d) - end subroutine vecadd_cuda + end subroutine vecadd_cuda - real(dp) function scalar_product_cuda(self, x, y) result(s) + real(dp) function scalar_product_cuda(self, x, y) result(s) implicit none class(cuda_backend_t) :: self @@ -608,15 +608,15 @@ real(dp) function scalar_product_cuda(self, x, y) result(s) s = sum_d call MPI_Allreduce(MPI_IN_PLACE, s, 1, MPI_DOUBLE_PRECISION, MPI_SUM, & - MPI_COMM_WORLD, ierr) + MPI_COMM_WORLD, ierr) - end function scalar_product_cuda + end function scalar_product_cuda - subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) + subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) implicit none real(dp), device, dimension(:, :, :), intent(out) :: u_send_s_dev, & - u_send_e_dev + u_send_e_dev real(dp), device, dimension(:, :, :), intent(in) :: u_dev integer, intent(in) :: n @@ -626,27 +626,27 @@ subroutine copy_into_buffers(u_send_s_dev, u_send_e_dev, u_dev, n) blocks = dim3(size(u_dev, dim = 3), 1, 1) threads = dim3(SZ, 1, 1) call buffer_copy<<>>(u_send_s_dev, u_send_e_dev, & - u_dev, n, n_halo) + u_dev, n, n_halo) - end subroutine copy_into_buffers + end subroutine copy_into_buffers - subroutine copy_data_to_f_cuda(self, f, data) + subroutine copy_data_to_f_cuda(self, f, data) class(cuda_backend_t), intent(inout) :: self class(field_t), intent(inout) :: f real(dp), dimension(:, :, :), intent(inout) :: data select type(f); type is (cuda_field_t); f%data_d = data; end select - end subroutine copy_data_to_f_cuda + end subroutine copy_data_to_f_cuda - subroutine copy_f_to_data_cuda(self, data, f) + subroutine copy_f_to_data_cuda(self, data, f) class(cuda_backend_t), intent(inout) :: self real(dp), dimension(:, :, :), intent(out) :: data class(field_t), intent(in) :: f select type(f); type is (cuda_field_t); data = f%data_d; end select - end subroutine copy_f_to_data_cuda + end subroutine copy_f_to_data_cuda - subroutine init_cuda_poisson_fft(self, xdirps, ydirps, zdirps) + subroutine init_cuda_poisson_fft(self, xdirps, ydirps, zdirps) implicit none class(cuda_backend_t) :: self @@ -655,22 +655,22 @@ subroutine init_cuda_poisson_fft(self, xdirps, ydirps, zdirps) allocate(cuda_poisson_fft_t :: self%poisson_fft) select type (poisson_fft => self%poisson_fft) - type is (cuda_poisson_fft_t) - poisson_fft = cuda_poisson_fft_t(xdirps, ydirps, zdirps) + type is (cuda_poisson_fft_t) + poisson_fft = cuda_poisson_fft_t(xdirps, ydirps, zdirps) end select - end subroutine init_cuda_poisson_fft + end subroutine init_cuda_poisson_fft - subroutine resolve_field_t(u_dev, u) + subroutine resolve_field_t(u_dev, u) real(dp), device, pointer, dimension(:, :, :), intent(out) :: u_dev class(field_t), intent(in) :: u select type(u) - type is (cuda_field_t) - u_dev => u%data_d + type is (cuda_field_t) + u_dev => u%data_d end select - end subroutine resolve_field_t + end subroutine resolve_field_t -end module m_cuda_backend + end module m_cuda_backend diff --git a/src/cuda/common.f90 b/src/cuda/common.f90 index dc65e886..ac2d66be 100644 --- a/src/cuda/common.f90 +++ b/src/cuda/common.f90 @@ -1,6 +1,6 @@ -module m_cuda_common - implicit none + module m_cuda_common + implicit none - integer, parameter :: SZ=32 + integer, parameter :: SZ=32 -end module m_cuda_common + end module m_cuda_common diff --git a/src/cuda/exec_dist.f90 b/src/cuda/exec_dist.f90 index a5b65b21..51c27c25 100644 --- a/src/cuda/exec_dist.f90 +++ b/src/cuda/exec_dist.f90 @@ -1,22 +1,22 @@ -module m_cuda_exec_dist - use cudafor - use mpi + module m_cuda_exec_dist + use cudafor + use mpi - use m_common, only: dp - use m_cuda_common, only: SZ - use m_cuda_kernels_dist, only: der_univ_dist, der_univ_subs, & - transeq_3fused_dist, transeq_3fused_subs - use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields - use m_cuda_tdsops, only: cuda_tdsops_t + use m_common, only: dp + use m_cuda_common, only: SZ + use m_cuda_kernels_dist, only: der_univ_dist, der_univ_subs, & + transeq_3fused_dist, transeq_3fused_subs + use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields + use m_cuda_tdsops, only: cuda_tdsops_t - implicit none + implicit none -contains + contains - subroutine exec_dist_tds_compact( & + subroutine exec_dist_tds_compact( & du, u, u_recv_s, u_recv_e, du_send_s, du_send_e, du_recv_s, du_recv_e, & tdsops, nproc, pprev, pnext, blocks, threads & - ) + ) implicit none ! du = d(u) @@ -27,7 +27,7 @@ subroutine exec_dist_tds_compact( & ! not because we actually need the data they store later where this ! subroutine is called. We absolutely don't care the data they pass back real(dp), device, dimension(:, :, :), intent(out) :: & - du_send_s, du_send_e, du_recv_s, du_recv_e + du_send_s, du_send_e, du_recv_s, du_recv_e type(cuda_tdsops_t), intent(in) :: tdsops integer, intent(in) :: nproc, pprev, pnext @@ -38,30 +38,30 @@ subroutine exec_dist_tds_compact( & n_data = SZ*1*blocks%x call der_univ_dist<<>>( & - du, du_send_s, du_send_e, u, u_recv_s, u_recv_e, & - tdsops%coeffs_s_dev, tdsops%coeffs_e_dev, tdsops%coeffs_dev, & - tdsops%n, tdsops%dist_fw_dev, tdsops%dist_bw_dev, tdsops%dist_af_dev & - ) + du, du_send_s, du_send_e, u, u_recv_s, u_recv_e, & + tdsops%coeffs_s_dev, tdsops%coeffs_e_dev, tdsops%coeffs_dev, & + tdsops%n, tdsops%dist_fw_dev, tdsops%dist_bw_dev, tdsops%dist_af_dev & + ) ! halo exchange for 2x2 systems call sendrecv_fields(du_recv_s, du_recv_e, du_send_s, du_send_e, & - n_data, nproc, pprev, pnext) + n_data, nproc, pprev, pnext) call der_univ_subs<<>>( & - du, du_recv_s, du_recv_e, & - tdsops%n, tdsops%dist_sa_dev, tdsops%dist_sc_dev & - ) + du, du_recv_s, du_recv_e, & + tdsops%n, tdsops%dist_sa_dev, tdsops%dist_sc_dev & + ) - end subroutine exec_dist_tds_compact + end subroutine exec_dist_tds_compact - subroutine exec_dist_transeq_3fused( & + subroutine exec_dist_transeq_3fused( & r_u, u, u_recv_s, u_recv_e, v, v_recv_s, v_recv_e, & du, dud, d2u, & du_send_s, du_send_e, du_recv_s, du_recv_e, & dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e, & der1st, der2nd, nu, nproc, pprev, pnext, blocks, threads & - ) + ) implicit none ! r_u = -1/2*(v*d1(u) + d1(u*v)) + nu*d2(u) @@ -74,9 +74,9 @@ subroutine exec_dist_transeq_3fused( & ! subroutine is called. We absolutely don't care the data they pass back real(dp), device, dimension(:, :, :), intent(out) :: du, dud, d2u real(dp), device, dimension(:, :, :), intent(out) :: & - du_send_s, du_send_e, du_recv_s, du_recv_e, & - dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & - d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e + du_send_s, du_send_e, du_recv_s, du_recv_e, & + dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & + d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e type(cuda_tdsops_t), intent(in) :: der1st, der2nd real(dp), intent(in) :: nu @@ -88,37 +88,37 @@ subroutine exec_dist_transeq_3fused( & n_data = SZ*1*blocks%x call transeq_3fused_dist<<>>( & - du, dud, d2u, & - du_send_s, du_send_e, & - dud_send_s, dud_send_e, & - d2u_send_s, d2u_send_e, & - u, u_recv_s, u_recv_e, & - v, v_recv_s, v_recv_e, der1st%n, & - der1st%coeffs_s_dev, der1st%coeffs_e_dev, der1st%coeffs_dev, & - der1st%dist_fw_dev, der1st%dist_bw_dev, der1st%dist_af_dev, & - der2nd%coeffs_s_dev, der2nd%coeffs_e_dev, der2nd%coeffs_dev, & - der2nd%dist_fw_dev, der2nd%dist_bw_dev, der2nd%dist_af_dev & - ) + du, dud, d2u, & + du_send_s, du_send_e, & + dud_send_s, dud_send_e, & + d2u_send_s, d2u_send_e, & + u, u_recv_s, u_recv_e, & + v, v_recv_s, v_recv_e, der1st%n, & + der1st%coeffs_s_dev, der1st%coeffs_e_dev, der1st%coeffs_dev, & + der1st%dist_fw_dev, der1st%dist_bw_dev, der1st%dist_af_dev, & + der2nd%coeffs_s_dev, der2nd%coeffs_e_dev, der2nd%coeffs_dev, & + der2nd%dist_fw_dev, der2nd%dist_bw_dev, der2nd%dist_af_dev & + ) ! halo exchange for 2x2 systems call sendrecv_3fields( & - du_recv_s, du_recv_e, dud_recv_s, dud_recv_e, & - d2u_recv_s, d2u_recv_e, & - du_send_s, du_send_e, dud_send_s, dud_send_e, & - d2u_send_s, d2u_send_e, & - n_data, nproc, pprev, pnext & - ) + du_recv_s, du_recv_e, dud_recv_s, dud_recv_e, & + d2u_recv_s, d2u_recv_e, & + du_send_s, du_send_e, dud_send_s, dud_send_e, & + d2u_send_s, d2u_send_e, & + n_data, nproc, pprev, pnext & + ) call transeq_3fused_subs<<>>( & - r_u, v, du, dud, d2u, & - du_recv_s, du_recv_e, & - dud_recv_s, dud_recv_e, & - d2u_recv_s, d2u_recv_e, & - der1st%dist_sa_dev, der1st%dist_sc_dev, & - der2nd%dist_sa_dev, der2nd%dist_sc_dev, & - der1st%n, nu & - ) - - end subroutine exec_dist_transeq_3fused - -end module m_cuda_exec_dist + r_u, v, du, dud, d2u, & + du_recv_s, du_recv_e, & + dud_recv_s, dud_recv_e, & + d2u_recv_s, d2u_recv_e, & + der1st%dist_sa_dev, der1st%dist_sc_dev, & + der2nd%dist_sa_dev, der2nd%dist_sc_dev, & + der1st%n, nu & + ) + + end subroutine exec_dist_transeq_3fused + + end module m_cuda_exec_dist diff --git a/src/cuda/kernels/complex.f90 b/src/cuda/kernels/complex.f90 index 32c5041f..d6b2787b 100644 --- a/src/cuda/kernels/complex.f90 +++ b/src/cuda/kernels/complex.f90 @@ -1,295 +1,295 @@ -module m_cuda_complex - use cudafor - - use m_common, only: dp - use m_cuda_common, only: SZ - - implicit none - -contains - - attributes(global) subroutine process_spectral_div_u( & - div, waves, nx, ny, nz, ax, bx, ay, by, az, bz & - ) - implicit none - - ! Arguments - complex(dp), device, intent(inout), dimension(:, :, :) :: div - complex(dp), device, intent(in), dimension(:, :, :) :: waves - real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz - integer, value, intent(in) :: nx, ny, nz - - ! Local variables - integer :: i, j, b, ix, iy, iz - real(dp) :: tmp_r, tmp_c, div_r, div_c - - i = threadIdx%x - b = blockIdx%x - - do j = 1, nx - ! normalisation - div_r = real(div(i, j, b), kind=dp)/(nx*ny*nz) - div_c = aimag(div(i, j, b))/(nx*ny*nz) - - ! get the indices for x, y, z directions - ix = j; iy = i + (b - 1)/(nz/2 + 1)*SZ; iz = mod(b - 1, nz/2 + 1) + 1 - - ! post-process forward - ! post-process in z - tmp_r = div_r - tmp_c = div_c - div_r = tmp_r*bz(iz) + tmp_c*az(iz) - div_c = tmp_c*bz(iz) - tmp_r*az(iz) + module m_cuda_complex + use cudafor + + use m_common, only: dp + use m_cuda_common, only: SZ + + implicit none + + contains + + attributes(global) subroutine process_spectral_div_u( & + div, waves, nx, ny, nz, ax, bx, ay, by, az, bz & + ) + implicit none + + ! Arguments + complex(dp), device, intent(inout), dimension(:, :, :) :: div + complex(dp), device, intent(in), dimension(:, :, :) :: waves + real(dp), device, intent(in), dimension(:) :: ax, bx, ay, by, az, bz + integer, value, intent(in) :: nx, ny, nz + + ! Local variables + integer :: i, j, b, ix, iy, iz + real(dp) :: tmp_r, tmp_c, div_r, div_c + + i = threadIdx%x + b = blockIdx%x + + do j = 1, nx + ! normalisation + div_r = real(div(i, j, b), kind=dp)/(nx*ny*nz) + div_c = aimag(div(i, j, b))/(nx*ny*nz) + + ! get the indices for x, y, z directions + ix = j; iy = i + (b - 1)/(nz/2 + 1)*SZ; iz = mod(b - 1, nz/2 + 1) + 1 + + ! post-process forward + ! post-process in z + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bz(iz) + tmp_c*az(iz) + div_c = tmp_c*bz(iz) - tmp_r*az(iz) - ! post-process in y - tmp_r = div_r - tmp_c = div_c - div_r = tmp_r*by(iy) + tmp_c*ay(iy) - div_c = tmp_c*by(iy) - tmp_r*ay(iy) - if (iy > ny/2 + 1) div_r = -div_r - if (iy > ny/2 + 1) div_c = -div_c + ! post-process in y + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*by(iy) + tmp_c*ay(iy) + div_c = tmp_c*by(iy) - tmp_r*ay(iy) + if (iy > ny/2 + 1) div_r = -div_r + if (iy > ny/2 + 1) div_c = -div_c - ! post-process in x - tmp_r = div_r - tmp_c = div_c - div_r = tmp_r*bx(ix) + tmp_c*ax(ix) - div_c = tmp_c*bx(ix) - tmp_r*ax(ix) - if (ix > nx/2 + 1) div_r = -div_r - if (ix > nx/2 + 1) div_c = -div_c + ! post-process in x + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bx(ix) + tmp_c*ax(ix) + div_c = tmp_c*bx(ix) - tmp_r*ax(ix) + if (ix > nx/2 + 1) div_r = -div_r + if (ix > nx/2 + 1) div_c = -div_c - ! Solve Poisson - tmp_r = real(waves(i, j, b), kind=dp) - tmp_c = aimag(waves(i, j, b)) - if ((tmp_r < 1.e-16_dp) .or. (tmp_c < 1.e-16_dp)) then - div_r = 0._dp; div_c = 0._dp - else - div_r = -div_r/tmp_r - div_c = -div_c/tmp_c - end if + ! Solve Poisson + tmp_r = real(waves(i, j, b), kind=dp) + tmp_c = aimag(waves(i, j, b)) + if ((tmp_r < 1.e-16_dp) .or. (tmp_c < 1.e-16_dp)) then + div_r = 0._dp; div_c = 0._dp + else + div_r = -div_r/tmp_r + div_c = -div_c/tmp_c + end if - ! post-process backward - ! post-process in z - tmp_r = div_r - tmp_c = div_c - div_r = tmp_r*bz(iz) - tmp_c*az(iz) - div_c = -tmp_c*bz(iz) - tmp_r*az(iz) + ! post-process backward + ! post-process in z + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bz(iz) - tmp_c*az(iz) + div_c = -tmp_c*bz(iz) - tmp_r*az(iz) - ! post-process in y - tmp_r = div_r - tmp_c = div_c - div_r = tmp_r*by(iy) + tmp_c*ay(iy) - div_c = tmp_c*by(iy) - tmp_r*ay(iy) - if (iy > ny/2 + 1) div_r = -div_r - if (iy > ny/2 + 1) div_c = -div_c + ! post-process in y + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*by(iy) + tmp_c*ay(iy) + div_c = tmp_c*by(iy) - tmp_r*ay(iy) + if (iy > ny/2 + 1) div_r = -div_r + if (iy > ny/2 + 1) div_c = -div_c - ! post-process in x - tmp_r = div_r - tmp_c = div_c - div_r = tmp_r*bx(ix) + tmp_c*ax(ix) - div_c = -tmp_c*bx(ix) + tmp_r*ax(ix) - if (ix > nx/2 + 1) div_r = -div_r - if (ix > nx/2 + 1) div_c = -div_c + ! post-process in x + tmp_r = div_r + tmp_c = div_c + div_r = tmp_r*bx(ix) + tmp_c*ax(ix) + div_c = -tmp_c*bx(ix) + tmp_r*ax(ix) + if (ix > nx/2 + 1) div_r = -div_r + if (ix > nx/2 + 1) div_c = -div_c - ! update the entry - div(i, j, b) = cmplx(div_r, div_c, kind=dp) - end do + ! update the entry + div(i, j, b) = cmplx(div_r, div_c, kind=dp) + end do - end subroutine process_spectral_div_u + end subroutine process_spectral_div_u - attributes(global) subroutine reorder_cmplx_x2y_T(u_y, u_x, nz) - implicit none + attributes(global) subroutine reorder_cmplx_x2y_T(u_y, u_x, nz) + implicit none - complex(dp), device, intent(out), dimension(:, :, :) :: u_y - complex(dp), device, intent(in), dimension(:, :, :) :: u_x - integer, value, intent(in) :: nz + complex(dp), device, intent(out), dimension(:, :, :) :: u_y + complex(dp), device, intent(in), dimension(:, :, :) :: u_x + integer, value, intent(in) :: nz - complex(dp), shared :: tile(SZ, SZ) + complex(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + integer :: i, j, b_i, b_j, b_k - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z - ! copy into shared - tile(i, j) = u_x((b_i - 1)*SZ + j, i, b_k + nz*(b_j - 1)) + ! copy into shared + tile(i, j) = u_x((b_i - 1)*SZ + j, i, b_k + nz*(b_j - 1)) - call syncthreads() + call syncthreads() - ! copy into output array from shared - u_y((b_j - 1)*SZ + j, i, (b_i - 1)*nz + b_k) = tile(j, i) + ! copy into output array from shared + u_y((b_j - 1)*SZ + j, i, (b_i - 1)*nz + b_k) = tile(j, i) - end subroutine reorder_cmplx_x2y_T +end subroutine reorder_cmplx_x2y_T - attributes(global) subroutine reorder_cmplx_y2x_T(u_x, u_y, nz) - implicit none +attributes(global) subroutine reorder_cmplx_y2x_T(u_x, u_y, nz) +implicit none - complex(dp), device, intent(out), dimension(:, :, :) :: u_x - complex(dp), device, intent(in), dimension(:, :, :) :: u_y - integer, value, intent(in) :: nz +complex(dp), device, intent(out), dimension(:, :, :) :: u_x +complex(dp), device, intent(in), dimension(:, :, :) :: u_y +integer, value, intent(in) :: nz - complex(dp), shared :: tile(SZ, SZ) +complex(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k +integer :: i, j, b_i, b_j, b_k - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z +i = threadIdx%x; j = threadIdx%y +b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z - ! copy into shared - tile(i, j) = u_y((b_j - 1)*SZ + j, i, b_k + nz*(b_i - 1)) + ! copy into shared +tile(i, j) = u_y((b_j - 1)*SZ + j, i, b_k + nz*(b_i - 1)) - call syncthreads() +call syncthreads() - ! copy into output array from shared - u_x((b_i - 1)*SZ + j, i, (b_j - 1)*nz + b_k) = tile(j, i) + ! copy into output array from shared +u_x((b_i - 1)*SZ + j, i, (b_j - 1)*nz + b_k) = tile(j, i) - end subroutine reorder_cmplx_y2x_T +end subroutine reorder_cmplx_y2x_T - attributes(global) subroutine reorder_cmplx_y2z_T(u_z, u_y, nx, nz) - implicit none +attributes(global) subroutine reorder_cmplx_y2z_T(u_z, u_y, nx, nz) +implicit none - complex(dp), device, intent(out), dimension(:, :, :) :: u_z - complex(dp), device, intent(in), dimension(:, :, :) :: u_y - integer, value, intent(in) :: nx, nz +complex(dp), device, intent(out), dimension(:, :, :) :: u_z +complex(dp), device, intent(in), dimension(:, :, :) :: u_y +integer, value, intent(in) :: nx, nz - complex(dp), shared :: tile(SZ, SZ) +complex(dp), shared :: tile(SZ, SZ) - integer :: i, j, k, b_i, b_j, b_k, b_x, b_y, b_z +integer :: i, j, k, b_i, b_j, b_k, b_x, b_y, b_z - i = threadIdx%x - j = threadIdx%y - k = threadIdx%z +i = threadIdx%x +j = threadIdx%y +k = threadIdx%z - b_x = blockIdx%z - b_y = blockIdx%y - b_z = blockIdx%x +b_x = blockIdx%z +b_y = blockIdx%y +b_z = blockIdx%x - ! copy into shared - if ( j + (b_z - 1)*SZ <= nz ) & - tile(i, j) = u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, & - j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) + ! copy into shared +if ( j + (b_z - 1)*SZ <= nz ) & + tile(i, j) = u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, & + j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) - call syncthreads() +call syncthreads() - ! copy into output array from shared - if ( i + (b_z - 1)*SZ <= nz ) & - u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx) = tile(j, i) + ! copy into output array from shared +if ( i + (b_z - 1)*SZ <= nz ) & + u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx) = tile(j, i) - end subroutine reorder_cmplx_y2z_T +end subroutine reorder_cmplx_y2z_T - attributes(global) subroutine reorder_cmplx_z2y_T(u_y, u_z, nx, nz) - implicit none +attributes(global) subroutine reorder_cmplx_z2y_T(u_y, u_z, nx, nz) +implicit none - complex(dp), device, intent(out), dimension(:, :, :) :: u_y - complex(dp), device, intent(in), dimension(:, :, :) :: u_z - integer, value, intent(in) :: nx, nz +complex(dp), device, intent(out), dimension(:, :, :) :: u_y +complex(dp), device, intent(in), dimension(:, :, :) :: u_z +integer, value, intent(in) :: nx, nz - complex(dp), shared :: tile(SZ, SZ) +complex(dp), shared :: tile(SZ, SZ) - integer :: i, j, k, b_x, b_y, b_z +integer :: i, j, k, b_x, b_y, b_z - i = threadIdx%x - j = threadIdx%y - k = threadIdx%z +i = threadIdx%x +j = threadIdx%y +k = threadIdx%z - b_x = blockIdx%z - b_y = blockIdx%y - b_z = blockIdx%x +b_x = blockIdx%z +b_y = blockIdx%y +b_z = blockIdx%x - ! copy into shared - if ( i + (b_z - 1)*SZ <= nz ) & - tile(i, j) = u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx) + ! copy into shared +if ( i + (b_z - 1)*SZ <= nz ) & + tile(i, j) = u_z(i + (b_z - 1)*SZ, j, b_x + (b_y - 1)*nx) - call syncthreads() +call syncthreads() - ! copy into output array from shared - if ( j + (b_z - 1)*SZ <= nz ) & - u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, & - j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) = tile(j, i) + ! copy into output array from shared +if ( j + (b_z - 1)*SZ <= nz ) & + u_y(i + (b_y - 1)*SZ, mod(b_x - 1, SZ) + 1, & + j + (b_z - 1)*SZ + ((b_x - 1)/SZ)*nz) = tile(j, i) - end subroutine reorder_cmplx_z2y_T +end subroutine reorder_cmplx_z2y_T - attributes(global) subroutine reshapeDSF(uout, uin) - implicit none +attributes(global) subroutine reshapeDSF(uout, uin) +implicit none - real(dp), device, intent(out), dimension(:, :, :) :: uout - real(dp), device, intent(in), dimension(:, :, :) :: uin +real(dp), device, intent(out), dimension(:, :, :) :: uout +real(dp), device, intent(in), dimension(:, :, :) :: uin - real(dp), shared :: tile(SZ + 1, SZ) +real(dp), shared :: tile(SZ + 1, SZ) - integer :: i, j, b_i, b +integer :: i, j, b_i, b - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b = blockIdx%y +i = threadIdx%x; j = threadIdx%y +b_i = blockIdx%x; b = blockIdx%y - tile(i, j) = uin(i, j + (b_i - 1)*SZ, b) +tile(i, j) = uin(i, j + (b_i - 1)*SZ, b) - call syncthreads() +call syncthreads() - uout(i + (b_i - 1)*SZ, j, b) = tile(j, i) +uout(i + (b_i - 1)*SZ, j, b) = tile(j, i) - end subroutine reshapeDSF +end subroutine reshapeDSF - attributes(global) subroutine reshapeDSB(uout, uin) - implicit none +attributes(global) subroutine reshapeDSB(uout, uin) +implicit none - real(dp), device, intent(out), dimension(:, :, :) :: uout - real(dp), device, intent(in), dimension(:, :, :) :: uin +real(dp), device, intent(out), dimension(:, :, :) :: uout +real(dp), device, intent(in), dimension(:, :, :) :: uin - real(dp), shared :: tile(SZ + 1, SZ) +real(dp), shared :: tile(SZ + 1, SZ) - integer :: i, j, b_i, b +integer :: i, j, b_i, b - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b = blockIdx%y +i = threadIdx%x; j = threadIdx%y +b_i = blockIdx%x; b = blockIdx%y - tile(i, j) = uin(i + (b_i - 1)*SZ, j, b) +tile(i, j) = uin(i + (b_i - 1)*SZ, j, b) - call syncthreads() +call syncthreads() - uout(i, j + (b_i - 1)*SZ, b) = tile(j, i) +uout(i, j + (b_i - 1)*SZ, b) = tile(j, i) - end subroutine reshapeDSB +end subroutine reshapeDSB - attributes(global) subroutine reshapeCDSF(uout, uin) - implicit none +attributes(global) subroutine reshapeCDSF(uout, uin) +implicit none - complex(dp), device, intent(out), dimension(:, :, :) :: uout - complex(dp), device, intent(in), dimension(:, :, :) :: uin +complex(dp), device, intent(out), dimension(:, :, :) :: uout +complex(dp), device, intent(in), dimension(:, :, :) :: uin - complex(dp), shared :: tile(SZ + 1, SZ) +complex(dp), shared :: tile(SZ + 1, SZ) - integer :: i, j, b_i, b +integer :: i, j, b_i, b - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b = blockIdx%y +i = threadIdx%x; j = threadIdx%y +b_i = blockIdx%x; b = blockIdx%y - tile(i, j) = uin(i, j + (b_i - 1)*SZ, b) +tile(i, j) = uin(i, j + (b_i - 1)*SZ, b) - call syncthreads() +call syncthreads() - uout(i + (b_i - 1)*SZ, j, b) = tile(j, i) +uout(i + (b_i - 1)*SZ, j, b) = tile(j, i) - end subroutine reshapeCDSF +end subroutine reshapeCDSF - attributes(global) subroutine reshapeCDSB(uout, uin) - implicit none +attributes(global) subroutine reshapeCDSB(uout, uin) +implicit none - complex(dp), device, intent(out), dimension(:, :, :) :: uout - complex(dp), device, intent(in), dimension(:, :, :) :: uin +complex(dp), device, intent(out), dimension(:, :, :) :: uout +complex(dp), device, intent(in), dimension(:, :, :) :: uin - complex(dp), shared :: tile(SZ + 1, SZ) +complex(dp), shared :: tile(SZ + 1, SZ) - integer :: i, j, b_i, b +integer :: i, j, b_i, b - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b = blockIdx%y +i = threadIdx%x; j = threadIdx%y +b_i = blockIdx%x; b = blockIdx%y - tile(i, j) = uin(i + (b_i - 1)*SZ, j, b) +tile(i, j) = uin(i + (b_i - 1)*SZ, j, b) - call syncthreads() +call syncthreads() - uout(i, j + (b_i - 1)*SZ, b) = tile(j, i) +uout(i, j + (b_i - 1)*SZ, b) = tile(j, i) - end subroutine reshapeCDSB +end subroutine reshapeCDSB end module m_cuda_complex diff --git a/src/cuda/kernels/distributed.f90 b/src/cuda/kernels/distributed.f90 index 6d8c53fe..7ed704b7 100644 --- a/src/cuda/kernels/distributed.f90 +++ b/src/cuda/kernels/distributed.f90 @@ -1,653 +1,653 @@ -module m_cuda_kernels_dist - use cudafor + module m_cuda_kernels_dist + use cudafor - use m_common, only: dp + use m_common, only: dp - implicit none + implicit none -contains + contains - attributes(global) subroutine der_univ_dist( & + attributes(global) subroutine der_univ_dist( & du, send_u_s, send_u_e, u, u_s, u_e, coeffs_s, coeffs_e, coeffs, n, & ffr, fbc, faf & ) - implicit none - - ! Arguments - real(dp), device, intent(out), dimension(:, :, :) :: du, send_u_s, & - send_u_e - real(dp), device, intent(in), dimension(:, :, :) :: u, u_s, u_e - real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e - real(dp), device, intent(in), dimension(:) :: coeffs - integer, value, intent(in) :: n - real(dp), device, intent(in), dimension(:) :: ffr, fbc, faf - - ! Local variables - integer :: i, j, b, k, lj - integer :: jm2, jm1, jp1, jp2 - - real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4, & - temp_du, alpha, last_r - - i = threadIdx%x - b = blockIdx%x - - ! store bulk coeffs in the registers - c_m4 = coeffs(1); c_m3 = coeffs(2); c_m2 = coeffs(3); c_m1 = coeffs(4) - c_j = coeffs(5) - c_p1 = coeffs(6); c_p2 = coeffs(7); c_p3 = coeffs(8); c_p4 = coeffs(9) - last_r = ffr(1) - - du(i, 1, b) = coeffs_s(1, 1)*u_s(i, 1, b) & - + coeffs_s(2, 1)*u_s(i, 2, b) & - + coeffs_s(3, 1)*u_s(i, 3, b) & - + coeffs_s(4, 1)*u_s(i, 4, b) & - + coeffs_s(5, 1)*u(i, 1, b) & - + coeffs_s(6, 1)*u(i, 2, b) & - + coeffs_s(7, 1)*u(i, 3, b) & - + coeffs_s(8, 1)*u(i, 4, b) & - + coeffs_s(9, 1)*u(i, 5, b) - du(i, 1, b) = du(i, 1, b)*faf(1) - du(i, 2, b) = coeffs_s(1, 2)*u_s(i, 2, b) & - + coeffs_s(2, 2)*u_s(i, 3, b) & - + coeffs_s(3, 2)*u_s(i, 4, b) & - + coeffs_s(4, 2)*u(i, 1, b) & - + coeffs_s(5, 2)*u(i, 2, b) & - + coeffs_s(6, 2)*u(i, 3, b) & - + coeffs_s(7, 2)*u(i, 4, b) & - + coeffs_s(8, 2)*u(i, 5, b) & - + coeffs_s(9, 2)*u(i, 6, b) - du(i, 2, b) = du(i, 2, b)*faf(2) - du(i, 3, b) = coeffs_s(1, 3)*u_s(i, 3, b) & - + coeffs_s(2, 3)*u_s(i, 4, b) & - + coeffs_s(3, 3)*u(i, 1, b) & - + coeffs_s(4, 3)*u(i, 2, b) & - + coeffs_s(5, 3)*u(i, 3, b) & - + coeffs_s(6, 3)*u(i, 4, b) & - + coeffs_s(7, 3)*u(i, 5, b) & - + coeffs_s(8, 3)*u(i, 6, b) & - + coeffs_s(9, 3)*u(i, 7, b) - du(i, 3, b) = ffr(3)*(du(i, 3, b) - faf(3)*du(i, 2, b)) - du(i, 4, b) = coeffs_s(1, 4)*u_s(i, 4, b) & - + coeffs_s(2, 4)*u(i, 1, b) & - + coeffs_s(3, 4)*u(i, 2, b) & - + coeffs_s(4, 4)*u(i, 3, b) & - + coeffs_s(5, 4)*u(i, 4, b) & - + coeffs_s(6, 4)*u(i, 5, b) & - + coeffs_s(7, 4)*u(i, 6, b) & - + coeffs_s(8, 4)*u(i, 7, b) & - + coeffs_s(9, 4)*u(i, 8, b) - du(i, 4, b) = ffr(4)*(du(i, 4, b) - faf(3)*du(i, 3, b)) - - alpha = faf(5) - - do j = 5, n - 4 - temp_du = c_m4*u(i, j - 4, b) + c_m3*u(i, j - 3, b) & - + c_m2*u(i, j - 2, b) + c_m1*u(i, j - 1, b) & - + c_j*u(i, j, b) & - + c_p1*u(i, j + 1, b) + c_p2*u(i, j + 2, b) & - + c_p3*u(i, j + 3, b) + c_p4*u(i, j + 4, b) - du(i, j, b) = ffr(j)*(temp_du - alpha*du(i, j - 1, b)) - end do - - j = n - 3 - du(i, j, b) = coeffs_e(1, 1)*u(i, j - 4, b) & - + coeffs_e(2, 1)*u(i, j - 3, b) & - + coeffs_e(3, 1)*u(i, j - 2, b) & - + coeffs_e(4, 1)*u(i, j - 1, b) & - + coeffs_e(5, 1)*u(i, j, b) & - + coeffs_e(6, 1)*u(i, j + 1, b) & - + coeffs_e(7, 1)*u(i, j + 2, b) & - + coeffs_e(8, 1)*u(i, j + 3, b) & - + coeffs_e(9, 1)*u_e(i, 1, b) - du(i, j, b) = ffr(j)*(du(i, j, b) - faf(j)*du(i, j - 1, b)) - j = n - 2 - du(i, j, b) = coeffs_e(1, 2)*u(i, j - 4, b) & - + coeffs_e(2, 2)*u(i, j - 3, b) & - + coeffs_e(3, 2)*u(i, j - 2, b) & - + coeffs_e(4, 2)*u(i, j - 1, b) & - + coeffs_e(5, 2)*u(i, j, b) & - + coeffs_e(6, 2)*u(i, j + 1, b) & - + coeffs_e(7, 2)*u(i, j + 2, b) & - + coeffs_e(8, 2)*u_e(i, 1, b) & - + coeffs_e(9, 2)*u_e(i, 2, b) - du(i, j, b) = ffr(j)*(du(i, j, b) - faf(j)*du(i, j - 1, b)) - j = n - 1 - du(i, j, b) = coeffs_e(1, 3)*u(i, j - 4, b) & - + coeffs_e(2, 3)*u(i, j - 3, b) & - + coeffs_e(3, 3)*u(i, j - 2, b) & - + coeffs_e(4, 3)*u(i, j - 1, b) & - + coeffs_e(5, 3)*u(i, j, b) & - + coeffs_e(6, 3)*u(i, j + 1, b) & - + coeffs_e(7, 3)*u_e(i, 1, b) & - + coeffs_e(8, 3)*u_e(i, 2, b) & - + coeffs_e(9, 3)*u_e(i, 3, b) - du(i, j, b) = ffr(j)*(du(i, j, b) - faf(j)*du(i, j - 1, b)) - j = n - du(i, j, b) = coeffs_e(1, 4)*u(i, j - 4, b) & - + coeffs_e(2, 4)*u(i, j - 3, b) & - + coeffs_e(3, 4)*u(i, j - 2, b) & - + coeffs_e(4, 4)*u(i, j - 1, b) & - + coeffs_e(5, 4)*u(i, j, b) & - + coeffs_e(6, 4)*u_e(i, 1, b) & - + coeffs_e(7, 4)*u_e(i, 2, b) & - + coeffs_e(8, 4)*u_e(i, 3, b) & - + coeffs_e(9, 4)*u_e(i, 4, b) - du(i, j, b) = ffr(j)*(du(i, j, b) - faf(j)*du(i, j - 1, b)) - - send_u_e(i, 1, b) = du(i, n, b) - - ! Backward pass of the hybrid algorithm - do j = n - 2, 2, -1 - du(i, j, b) = du(i, j, b) - fbc(j)*du(i, j + 1, b) - end do - du(i, 1, b) = last_r*(du(i, 1, b) - fbc(1)*du(i, 2, b)) - send_u_s(i, 1, b) = du(i, 1, b) - - end subroutine der_univ_dist - - attributes(global) subroutine der_univ_subs(du, recv_u_s, recv_u_e, & - n, dist_sa, dist_sc) - implicit none - - ! Arguments - real(dp), device, intent(out), dimension(:, :, :) :: du - real(dp), device, intent(in), dimension(:, :, :) :: recv_u_s, recv_u_e - real(dp), device, intent(in), dimension(:) :: dist_sa, dist_sc - integer, value, intent(in) :: n - - ! Local variables - integer :: i, j, b - real(dp) :: ur, bl, recp, du_s, du_e - - i = threadIdx%x - b = blockIdx%x - - ! A small trick we do here is valid for symmetric Toeplitz matrices. - ! In our case our matrices satisfy this criteria in the (5:n-4) region - ! and as long as a rank has around at least 20 entries the assumptions - ! we make here are perfectly valid. - - ! bl is the bottom left entry in the 2x2 matrix - ! ur is the upper right entry in the 2x2 matrix - - ! Start - ! At the start we have the 'bl', and assume 'ur' - bl = dist_sa(1) - ur = dist_sa(1) - recp = 1._dp/(1._dp - ur*bl) - du_s = recp*(du(i, 1, b) - bl*recv_u_s(i, 1, b)) - - ! End - ! At the end we have the 'ur', and assume 'bl' - bl = dist_sc(n) - ur = dist_sc(n) - recp = 1._dp/(1._dp - ur*bl) - du_e = recp*(du(i, n, b) - ur*recv_u_e(i, 1, b)) - - du(i, 1, b) = du_s - do j = 2, n - 1 - du(i, j, b) = (du(i, j, b) - dist_sa(j)*du_s - dist_sc(j)*du_e) - end do - du(i, n, b) = du_e - - end subroutine der_univ_subs - - attributes(global) subroutine transeq_3fused_dist( & - du, dud, d2u, & - send_du_s, send_du_e, send_dud_s, send_dud_e, send_d2u_s, send_d2u_e, & - u, u_s, u_e, v, v_s, v_e, n, & - d1_coeffs_s, d1_coeffs_e, d1_coeffs, d1_fw, d1_bw, d1_af, & - d2_coeffs_s, d2_coeffs_e, d2_coeffs, d2_fw, d2_bw, d2_af & - ) - implicit none - - ! Arguments - real(dp), device, intent(out), dimension(:, :, :) :: du, dud, d2u - real(dp), device, intent(out), dimension(:, :, :) :: & - send_du_s, send_du_e, send_dud_s, send_dud_e, send_d2u_s, send_d2u_e - real(dp), device, intent(in), dimension(:, :, :) :: u, u_s, u_e, & - v, v_s, v_e - integer, value, intent(in) :: n - real(dp), device, intent(in) :: d1_coeffs_s(:, :), d1_coeffs_e(:, :), & - d1_coeffs(:) - real(dp), device, intent(in) :: d1_fw(:), d1_bw(:), d1_af(:) - real(dp), device, intent(in) :: d2_coeffs_s(:, :), d2_coeffs_e(:, :), & - d2_coeffs(:) - real(dp), device, intent(in) :: d2_fw(:), d2_bw(:), d2_af(:) - - ! Local variables - integer :: i, j, b - - real(dp) :: d1_c_m4, d1_c_m3, d1_c_m2, d1_c_m1, d1_c_j, & - d1_c_p1, d1_c_p2, d1_c_p3, d1_c_p4, & - d1_alpha, d1_last_r - real(dp) :: d2_c_m4, d2_c_m3, d2_c_m2, d2_c_m1, d2_c_j, & - d2_c_p1, d2_c_p2, d2_c_p3, d2_c_p4, & - d2_alpha, d2_last_r - real(dp) :: temp_du, temp_dud, temp_d2u - real(dp) :: u_m4, u_m3, u_m2, u_m1, u_j, u_p1, u_p2, u_p3, u_p4 - real(dp) :: v_m4, v_m3, v_m2, v_m1, v_j, v_p1, v_p2, v_p3, v_p4 - real(dp) :: old_du, old_dud, old_d2u - - i = threadIdx%x - b = blockIdx%x - - d1_last_r = d1_fw(1) - d2_last_r = d2_fw(1) - - ! j = 1 - temp_du = d1_coeffs_s(1, 1)*u_s(i, 1, b) & - + d1_coeffs_s(2, 1)*u_s(i, 2, b) & - + d1_coeffs_s(3, 1)*u_s(i, 3, b) & - + d1_coeffs_s(4, 1)*u_s(i, 4, b) & - + d1_coeffs_s(5, 1)*u(i, 1, b) & - + d1_coeffs_s(6, 1)*u(i, 2, b) & - + d1_coeffs_s(7, 1)*u(i, 3, b) & - + d1_coeffs_s(8, 1)*u(i, 4, b) & - + d1_coeffs_s(9, 1)*u(i, 5, b) - du(i, 1, b) = temp_du*d1_af(1) - temp_dud = d1_coeffs_s(1, 1)*u_s(i, 1, b)*v_s(i, 1, b) & - + d1_coeffs_s(2, 1)*u_s(i, 2, b)*v_s(i, 2, b) & - + d1_coeffs_s(3, 1)*u_s(i, 3, b)*v_s(i, 3, b) & - + d1_coeffs_s(4, 1)*u_s(i, 4, b)*v_s(i, 4, b) & - + d1_coeffs_s(5, 1)*u(i, 1, b)*v(i, 1, b) & - + d1_coeffs_s(6, 1)*u(i, 2, b)*v(i, 2, b) & - + d1_coeffs_s(7, 1)*u(i, 3, b)*v(i, 3, b) & - + d1_coeffs_s(8, 1)*u(i, 4, b)*v(i, 4, b) & - + d1_coeffs_s(9, 1)*u(i, 5, b)*v(i, 5, b) - dud(i, 1, b) = temp_dud*d1_af(1) - temp_d2u = d2_coeffs_s(1, 1)*u_s(i, 1, b) & - + d2_coeffs_s(2, 1)*u_s(i, 2, b) & - + d2_coeffs_s(3, 1)*u_s(i, 3, b) & - + d2_coeffs_s(4, 1)*u_s(i, 4, b) & - + d2_coeffs_s(5, 1)*u(i, 1, b) & - + d2_coeffs_s(6, 1)*u(i, 2, b) & - + d2_coeffs_s(7, 1)*u(i, 3, b) & - + d2_coeffs_s(8, 1)*u(i, 4, b) & - + d2_coeffs_s(9, 1)*u(i, 5, b) - d2u(i, 1, b) = temp_d2u*d2_af(1) - ! j = 2 - temp_du = d1_coeffs_s(1, 2)*u_s(i, 2, b) & - + d1_coeffs_s(2, 2)*u_s(i, 3, b) & - + d1_coeffs_s(3, 2)*u_s(i, 4, b) & - + d1_coeffs_s(4, 2)*u(i, 1, b) & - + d1_coeffs_s(5, 2)*u(i, 2, b) & - + d1_coeffs_s(6, 2)*u(i, 3, b) & - + d1_coeffs_s(7, 2)*u(i, 4, b) & - + d1_coeffs_s(8, 2)*u(i, 5, b) & - + d1_coeffs_s(9, 2)*u(i, 6, b) - du(i, 2, b) = temp_du*d1_af(2) - temp_dud = d1_coeffs_s(1, 2)*u_s(i, 2, b)*v_s(i, 2, b) & - + d1_coeffs_s(2, 2)*u_s(i, 3, b)*v_s(i, 3, b) & - + d1_coeffs_s(3, 2)*u_s(i, 4, b)*v_s(i, 4, b) & - + d1_coeffs_s(4, 2)*u(i, 1, b)*v(i, 1, b) & - + d1_coeffs_s(5, 2)*u(i, 2, b)*v(i, 2, b) & - + d1_coeffs_s(6, 2)*u(i, 3, b)*v(i, 3, b) & - + d1_coeffs_s(7, 2)*u(i, 4, b)*v(i, 4, b) & - + d1_coeffs_s(8, 2)*u(i, 5, b)*v(i, 5, b) & - + d1_coeffs_s(9, 2)*u(i, 6, b)*v(i, 6, b) - dud(i, 2, b) = temp_dud*d1_af(2) - temp_d2u = d2_coeffs_s(1, 2)*u_s(i, 2, b) & - + d2_coeffs_s(2, 2)*u_s(i, 3, b) & - + d2_coeffs_s(3, 2)*u_s(i, 4, b) & - + d2_coeffs_s(4, 2)*u(i, 1, b) & - + d2_coeffs_s(5, 2)*u(i, 2, b) & - + d2_coeffs_s(6, 2)*u(i, 3, b) & - + d2_coeffs_s(7, 2)*u(i, 4, b) & - + d2_coeffs_s(8, 2)*u(i, 5, b) & - + d2_coeffs_s(9, 2)*u(i, 6, b) - d2u(i, 2, b) = temp_d2u*d2_af(2) - ! j = 3 - temp_du = d1_coeffs_s(1, 3)*u_s(i, 3, b) & - + d1_coeffs_s(2, 3)*u_s(i, 4, b) & - + d1_coeffs_s(3, 3)*u(i, 1, b) & - + d1_coeffs_s(4, 3)*u(i, 2, b) & - + d1_coeffs_s(5, 3)*u(i, 3, b) & - + d1_coeffs_s(6, 3)*u(i, 4, b) & - + d1_coeffs_s(7, 3)*u(i, 5, b) & - + d1_coeffs_s(8, 3)*u(i, 6, b) & - + d1_coeffs_s(9, 3)*u(i, 7, b) - du(i, 3, b) = d1_fw(3)*(temp_du - d1_af(3)*du(i, 2, b)) - temp_dud = d1_coeffs_s(1, 3)*u_s(i, 3, b)*v_s(i, 3, b) & - + d1_coeffs_s(2, 3)*u_s(i, 4, b)*v_s(i, 4, b) & - + d1_coeffs_s(3, 3)*u(i, 1, b)*v(i, 1, b) & - + d1_coeffs_s(4, 3)*u(i, 2, b)*v(i, 2, b) & - + d1_coeffs_s(5, 3)*u(i, 3, b)*v(i, 3, b) & - + d1_coeffs_s(6, 3)*u(i, 4, b)*v(i, 4, b) & - + d1_coeffs_s(7, 3)*u(i, 5, b)*v(i, 5, b) & - + d1_coeffs_s(8, 3)*u(i, 6, b)*v(i, 6, b) & - + d1_coeffs_s(9, 3)*u(i, 7, b)*v(i, 7, b) - dud(i, 3, b) = d1_fw(3)*(temp_dud - d1_af(3)*dud(i, 2, b)) - temp_d2u = d2_coeffs_s(1, 3)*u_s(i, 3, b) & - + d2_coeffs_s(2, 3)*u_s(i, 4, b) & - + d2_coeffs_s(3, 3)*u(i, 1, b) & - + d2_coeffs_s(4, 3)*u(i, 2, b) & - + d2_coeffs_s(5, 3)*u(i, 3, b) & - + d2_coeffs_s(6, 3)*u(i, 4, b) & - + d2_coeffs_s(7, 3)*u(i, 5, b) & - + d2_coeffs_s(8, 3)*u(i, 6, b) & - + d2_coeffs_s(9, 3)*u(i, 7, b) - d2u(i, 3, b) = d2_fw(3)*(temp_d2u - d2_af(3)*d2u(i, 2, b)) - ! j = 4 - temp_du = d1_coeffs_s(1, 4)*u_s(i, 4, b) & - + d1_coeffs_s(2, 4)*u(i, 1, b) & - + d1_coeffs_s(3, 4)*u(i, 2, b) & - + d1_coeffs_s(4, 4)*u(i, 3, b) & - + d1_coeffs_s(5, 4)*u(i, 4, b) & - + d1_coeffs_s(6, 4)*u(i, 5, b) & - + d1_coeffs_s(7, 4)*u(i, 6, b) & - + d1_coeffs_s(8, 4)*u(i, 7, b) & - + d1_coeffs_s(9, 4)*u(i, 8, b) - du(i, 4, b) = d1_fw(4)*(temp_du - d1_af(3)*du(i, 3, b)) - temp_dud = d1_coeffs_s(1, 4)*u_s(i, 4, b)*v_s(i, 4, b) & - + d1_coeffs_s(2, 4)*u(i, 1, b)*v(i, 1, b) & - + d1_coeffs_s(3, 4)*u(i, 2, b)*v(i, 2, b) & - + d1_coeffs_s(4, 4)*u(i, 3, b)*v(i, 3, b) & - + d1_coeffs_s(5, 4)*u(i, 4, b)*v(i, 4, b) & - + d1_coeffs_s(6, 4)*u(i, 5, b)*v(i, 5, b) & - + d1_coeffs_s(7, 4)*u(i, 6, b)*v(i, 6, b) & - + d1_coeffs_s(8, 4)*u(i, 7, b)*v(i, 7, b) & - + d1_coeffs_s(9, 4)*u(i, 8, b)*v(i, 8, b) - dud(i, 4, b) = d1_fw(4)*(temp_dud - d1_af(3)*dud(i, 3, b)) - temp_d2u = d2_coeffs_s(1, 4)*u_s(i, 4, b) & - + d2_coeffs_s(2, 4)*u(i, 1, b) & - + d2_coeffs_s(3, 4)*u(i, 2, b) & - + d2_coeffs_s(4, 4)*u(i, 3, b) & - + d2_coeffs_s(5, 4)*u(i, 4, b) & - + d2_coeffs_s(6, 4)*u(i, 5, b) & - + d2_coeffs_s(7, 4)*u(i, 6, b) & - + d2_coeffs_s(8, 4)*u(i, 7, b) & - + d2_coeffs_s(9, 4)*u(i, 8, b) - d2u(i, 4, b) = d2_fw(4)*(temp_d2u - d2_af(3)*d2u(i, 3, b)) - - d1_alpha = d1_af(5) - d2_alpha = d2_af(5) - - ! store bulk coeffs in the registers - d1_c_m4 = d1_coeffs(1); d1_c_m3 = d1_coeffs(2) - d1_c_m2 = d1_coeffs(3); d1_c_m1 = d1_coeffs(4) - d1_c_j = d1_coeffs(5) - d1_c_p1 = d1_coeffs(6); d1_c_p2 = d1_coeffs(7) - d1_c_p3 = d1_coeffs(8); d1_c_p4 = d1_coeffs(9) - - d2_c_m4 = d2_coeffs(1); d2_c_m3 = d2_coeffs(2) - d2_c_m2 = d2_coeffs(3); d2_c_m1 = d2_coeffs(4) - d2_c_j = d2_coeffs(5) - d2_c_p1 = d2_coeffs(6); d2_c_p2 = d2_coeffs(7) - d2_c_p3 = d2_coeffs(8); d2_c_p4 = d2_coeffs(9) - - ! It is better to access d?(i, j - 1, b) via old_d? - old_du = du(i, 4, b) - old_dud = dud(i, 4, b) - old_d2u = d2u(i, 4, b) - - ! Populate registers with the u and v stencils - u_m4 = u(i, 1, b); u_m3 = u(i, 2, b) - u_m2 = u(i, 3, b); u_m1 = u(i, 4, b) - u_j = u(i, 5, b); u_p1 = u(i, 6, b) - u_p2 = u(i, 7, b); u_p3 = u(i, 8, b) - v_m4 = v(i, 1, b); v_m3 = v(i, 2, b) - v_m2 = v(i, 3, b); v_m1 = v(i, 4, b) - v_j = v(i, 5, b); v_p1 = v(i, 6, b) - v_p2 = v(i, 7, b); v_p3 = v(i, 8, b) - - do j = 5, n - 4 - u_p4 = u(i, j+4, b); v_p4 = v(i, j+4, b) - - ! du - temp_du = d1_c_m4*u_m4 + d1_c_m3*u_m3 + d1_c_m2*u_m2 + d1_c_m1*u_m1 & - + d1_c_j*u_j & - + d1_c_p1*u_p1 + d1_c_p2*u_p2 + d1_c_p3*u_p3 + d1_c_p4*u_p4 - du(i, j, b) = d1_fw(j)*(temp_du - d1_alpha*old_du) - old_du = du(i, j, b) - - ! dud - temp_dud = d1_c_m4*u_m4*v_m4 + d1_c_m3*u_m3*v_m3 & - + d1_c_m2*u_m2*v_m2 + d1_c_m1*u_m1*v_m1 & - + d1_c_j*u_j*v_j & - + d1_c_p1*u_p1*v_p1 + d1_c_p2*u_p2*v_p2 & - + d1_c_p3*u_p3*v_p3 + d1_c_p4*u_p4*v_p4 - dud(i, j, b) = d1_fw(j)*(temp_dud - d1_alpha*old_dud) - old_dud = dud(i, j, b) - - ! d2u - temp_d2u = d2_c_m4*u_m4 + d2_c_m3*u_m3 + d2_c_m2*u_m2 + d2_c_m1*u_m1 & - + d2_c_j*u_j & - + d2_c_p1*u_p1 + d2_c_p2*u_p2 + d2_c_p3*u_p3 + d2_c_p4*u_p4 - d2u(i, j, b) = d2_fw(j)*(temp_d2u - d2_alpha*old_d2u) - old_d2u = d2u(i, j, b) - - ! Prepare registers for the next step - u_m4 = u_m3; u_m3 = u_m2; u_m2 = u_m1; u_m1 = u_j - u_j = u_p1; u_p1 = u_p2; u_p2 = u_p3; u_p3 = u_p4 - v_m4 = v_m3; v_m3 = v_m2; v_m2 = v_m1; v_m1 = v_j - v_j = v_p1; v_p1 = v_p2; v_p2 = v_p3; v_p3 = v_p4 - end do - - j = n - 3 - temp_du = d1_coeffs_e(1, 1)*u(i, j - 4, b) & - + d1_coeffs_e(2, 1)*u(i, j - 3, b) & - + d1_coeffs_e(3, 1)*u(i, j - 2, b) & - + d1_coeffs_e(4, 1)*u(i, j - 1, b) & - + d1_coeffs_e(5, 1)*u(i, j, b) & - + d1_coeffs_e(6, 1)*u(i, j + 1, b) & - + d1_coeffs_e(7, 1)*u(i, j + 2, b) & - + d1_coeffs_e(8, 1)*u(i, j + 3, b) & - + d1_coeffs_e(9, 1)*u_e(i, 1, b) - du(i, j, b) = d1_fw(j)*(temp_du - d1_af(j)*du(i, j - 1, b)) - temp_dud = d1_coeffs_e(1, 1)*u(i, j - 4, b)*v(i, j - 4, b) & - + d1_coeffs_e(2, 1)*u(i, j - 3, b)*v(i, j - 3, b) & - + d1_coeffs_e(3, 1)*u(i, j - 2, b)*v(i, j - 2, b) & - + d1_coeffs_e(4, 1)*u(i, j - 1, b)*v(i, j - 1, b) & - + d1_coeffs_e(5, 1)*u(i, j, b)*v(i, j, b) & - + d1_coeffs_e(6, 1)*u(i, j + 1, b)*v(i, j + 1, b) & - + d1_coeffs_e(7, 1)*u(i, j + 2, b)*v(i, j + 2, b) & - + d1_coeffs_e(8, 1)*u(i, j + 3, b)*v(i, j + 3, b) & - + d1_coeffs_e(9, 1)*u_e(i, 1, b)*v_e(i, 1, b) - dud(i, j, b) = d1_fw(j)*(temp_dud - d1_af(j)*dud(i, j - 1, b)) - temp_d2u = d1_coeffs_e(1, 1)*u(i, j - 4, b) & - + d2_coeffs_e(2, 1)*u(i, j - 3, b) & - + d2_coeffs_e(3, 1)*u(i, j - 2, b) & - + d2_coeffs_e(4, 1)*u(i, j - 1, b) & - + d2_coeffs_e(5, 1)*u(i, j, b) & - + d2_coeffs_e(6, 1)*u(i, j + 1, b) & - + d2_coeffs_e(7, 1)*u(i, j + 2, b) & - + d2_coeffs_e(8, 1)*u(i, j + 3, b) & - + d2_coeffs_e(9, 1)*u_e(i, 1, b) - d2u(i, j, b) = d2_fw(j)*(temp_d2u - d2_af(j)*d2u(i, j - 1, b)) - j = n - 2 - temp_du = d1_coeffs_e(1, 2)*u(i, j - 4, b) & - + d1_coeffs_e(2, 2)*u(i, j - 3, b) & - + d1_coeffs_e(3, 2)*u(i, j - 2, b) & - + d1_coeffs_e(4, 2)*u(i, j - 1, b) & - + d1_coeffs_e(5, 2)*u(i, j, b) & - + d1_coeffs_e(6, 2)*u(i, j + 1, b) & - + d1_coeffs_e(7, 2)*u(i, j + 2, b) & - + d1_coeffs_e(8, 2)*u_e(i, 1, b) & - + d1_coeffs_e(9, 2)*u_e(i, 2, b) - du(i, j, b) = d1_fw(j)*(temp_du - d1_af(j)*du(i, j - 1, b)) - temp_dud = d1_coeffs_e(1, 2)*u(i, j - 4, b)*v(i, j - 4, b) & - + d1_coeffs_e(2, 2)*u(i, j - 3, b)*v(i, j - 3, b) & - + d1_coeffs_e(3, 2)*u(i, j - 2, b)*v(i, j - 2, b) & - + d1_coeffs_e(4, 2)*u(i, j - 1, b)*v(i, j - 1, b) & - + d1_coeffs_e(5, 2)*u(i, j, b)*v(i, j, b) & - + d1_coeffs_e(6, 2)*u(i, j + 1, b)*v(i, j + 1, b) & - + d1_coeffs_e(7, 2)*u(i, j + 2, b)*v(i, j + 2, b) & - + d1_coeffs_e(8, 2)*u_e(i, 1, b)*v_e(i, 1, b) & - + d1_coeffs_e(9, 2)*u_e(i, 2, b)*v_e(i, 2, b) - dud(i, j, b) = d1_fw(j)*(temp_dud - d1_af(j)*dud(i, j - 1, b)) - temp_d2u = d2_coeffs_e(1, 2)*u(i, j - 4, b) & - + d2_coeffs_e(2, 2)*u(i, j - 3, b) & - + d2_coeffs_e(3, 2)*u(i, j - 2, b) & - + d2_coeffs_e(4, 2)*u(i, j - 1, b) & - + d2_coeffs_e(5, 2)*u(i, j, b) & - + d2_coeffs_e(6, 2)*u(i, j + 1, b) & - + d2_coeffs_e(7, 2)*u(i, j + 2, b) & - + d2_coeffs_e(8, 2)*u_e(i, 1, b) & - + d2_coeffs_e(9, 2)*u_e(i, 2, b) - d2u(i, j, b) = d2_fw(j)*(temp_d2u - d2_af(j)*d2u(i, j - 1, b)) - j = n - 1 - temp_du = d1_coeffs_e(1, 3)*u(i, j - 4, b) & - + d1_coeffs_e(2, 3)*u(i, j - 3, b) & - + d1_coeffs_e(3, 3)*u(i, j - 2, b) & - + d1_coeffs_e(4, 3)*u(i, j - 1, b) & - + d1_coeffs_e(5, 3)*u(i, j, b) & - + d1_coeffs_e(6, 3)*u(i, j + 1, b) & - + d1_coeffs_e(7, 3)*u_e(i, 1, b) & - + d1_coeffs_e(8, 3)*u_e(i, 2, b) & - + d1_coeffs_e(9, 3)*u_e(i, 3, b) - du(i, j, b) = d1_fw(j)*(temp_du - d1_af(j)*du(i, j - 1, b)) - temp_dud = d1_coeffs_e(1, 3)*u(i, j - 4, b)*v(i, j - 4, b) & - + d1_coeffs_e(2, 3)*u(i, j - 3, b)*v(i, j - 3, b) & - + d1_coeffs_e(3, 3)*u(i, j - 2, b)*v(i, j - 2, b) & - + d1_coeffs_e(4, 3)*u(i, j - 1, b)*v(i, j - 1, b) & - + d1_coeffs_e(5, 3)*u(i, j, b)*v(i, j, b) & - + d1_coeffs_e(6, 3)*u(i, j + 1, b)*v(i, j + 1, b) & - + d1_coeffs_e(7, 3)*u_e(i, 1, b)*v_e(i, 1, b) & - + d1_coeffs_e(8, 3)*u_e(i, 2, b)*v_e(i, 2, b) & - + d1_coeffs_e(9, 3)*u_e(i, 3, b)*v_e(i, 3, b) - dud(i, j, b) = d1_fw(j)*(temp_dud - d1_af(j)*dud(i, j - 1, b)) - temp_d2u = d2_coeffs_e(1, 3)*u(i, j - 4, b) & - + d2_coeffs_e(2, 3)*u(i, j - 3, b) & - + d2_coeffs_e(3, 3)*u(i, j - 2, b) & - + d2_coeffs_e(4, 3)*u(i, j - 1, b) & - + d2_coeffs_e(5, 3)*u(i, j, b) & - + d2_coeffs_e(6, 3)*u(i, j + 1, b) & - + d2_coeffs_e(7, 3)*u_e(i, 1, b) & - + d2_coeffs_e(8, 3)*u_e(i, 2, b) & - + d2_coeffs_e(9, 3)*u_e(i, 3, b) - d2u(i, j, b) = d2_fw(j)*(temp_d2u - d2_af(j)*d2u(i, j - 1, b)) - j = n - temp_du = d1_coeffs_e(1, 4)*u(i, j - 4, b) & - + d1_coeffs_e(2, 4)*u(i, j - 3, b) & - + d1_coeffs_e(3, 4)*u(i, j - 2, b) & - + d1_coeffs_e(4, 4)*u(i, j - 1, b) & - + d1_coeffs_e(5, 4)*u(i, j, b) & - + d1_coeffs_e(6, 4)*u_e(i, 1, b) & - + d1_coeffs_e(7, 4)*u_e(i, 2, b) & - + d1_coeffs_e(8, 4)*u_e(i, 3, b) & - + d1_coeffs_e(9, 4)*u_e(i, 4, b) - du(i, j, b) = d1_fw(j)*(temp_du - d1_af(j)*du(i, j - 1, b)) - temp_dud = d1_coeffs_e(1, 4)*u(i, j - 4, b)*v(i, j - 4, b) & - + d1_coeffs_e(2, 4)*u(i, j - 3, b)*v(i, j - 3, b) & - + d1_coeffs_e(3, 4)*u(i, j - 2, b)*v(i, j - 2, b) & - + d1_coeffs_e(4, 4)*u(i, j - 1, b)*v(i, j - 1, b) & - + d1_coeffs_e(5, 4)*u(i, j, b)*v(i, j, b) & - + d1_coeffs_e(6, 4)*u_e(i, 1, b)*v_e(i, 1, b) & - + d1_coeffs_e(7, 4)*u_e(i, 2, b)*v_e(i, 2, b) & - + d1_coeffs_e(8, 4)*u_e(i, 3, b)*v_e(i, 3, b) & - + d1_coeffs_e(9, 4)*u_e(i, 4, b)*v_e(i, 4, b) - dud(i, j, b) = d1_fw(j)*(temp_dud - d1_af(j)*dud(i, j - 1, b)) - temp_d2u = d2_coeffs_e(1, 4)*u(i, j - 4, b) & - + d2_coeffs_e(2, 4)*u(i, j - 3, b) & - + d2_coeffs_e(3, 4)*u(i, j - 2, b) & - + d2_coeffs_e(4, 4)*u(i, j - 1, b) & - + d2_coeffs_e(5, 4)*u(i, j, b) & - + d2_coeffs_e(6, 4)*u_e(i, 1, b) & - + d2_coeffs_e(7, 4)*u_e(i, 2, b) & - + d2_coeffs_e(8, 4)*u_e(i, 3, b) & - + d2_coeffs_e(9, 4)*u_e(i, 4, b) - d2u(i, j, b) = d2_fw(j)*(temp_d2u - d2_af(j)*d2u(i, j - 1, b)) - - send_du_e(i, 1, b) = du(i, n, b) - send_dud_e(i, 1, b) = dud(i, n, b) - send_d2u_e(i, 1, b) = d2u(i, n, b) - - ! Backward pass of the hybrid algorithm - do j = n - 2, 2, -1 - du(i, j, b) = du(i, j, b) - d1_bw(j)*du(i, j + 1, b) - dud(i, j, b) = dud(i, j, b) - d1_bw(j)*dud(i, j + 1, b) - d2u(i, j, b) = d2u(i, j, b) - d2_bw(j)*d2u(i, j + 1, b) - end do - du(i, 1, b) = d1_last_r*(du(i, 1, b) - d1_bw(1)*du(i, 2, b)) - dud(i, 1, b) = d1_last_r*(dud(i, 1, b) - d1_bw(1)*dud(i, 2, b)) - d2u(i, 1, b) = d2_last_r*(d2u(i, 1, b) - d2_bw(1)*d2u(i, 2, b)) - - send_du_s(i, 1, b) = du(i, 1, b) - send_dud_s(i, 1, b) = dud(i, 1, b) - send_d2u_s(i, 1, b) = d2u(i, 1, b) - - end subroutine transeq_3fused_dist - - attributes(global) subroutine transeq_3fused_subs( & - r_u, conv, du, dud, d2u, & - recv_du_s, recv_du_e, recv_dud_s, recv_dud_e, recv_d2u_s, recv_d2u_e, & - d1_sa, d1_sc, d2_sa, d2_sc, n, nu & - ) - implicit none - - ! Arguments - real(dp), device, intent(out), dimension(:, :, :) :: r_u - real(dp), device, intent(in), dimension(:, :, :) :: conv, du, dud, d2u - real(dp), device, intent(in), dimension(:, :, :) :: & - recv_du_s, recv_du_e, recv_dud_s, recv_dud_e, recv_d2u_s, recv_d2u_e - real(dp), device, intent(in), dimension(:) :: d1_sa, d1_sc, d2_sa, d2_sc - integer, value, intent(in) :: n - real(dp), value, intent(in) :: nu - - ! Local variables - integer :: i, j, b - real(dp) :: ur, bl, recp - real(dp) :: du_temp, dud_temp, d2u_temp - real(dp) :: du_s, du_e, dud_s, dud_e, d2u_s, d2u_e - - i = threadIdx%x - b = blockIdx%x - - ! A small trick we do here is valid for symmetric Toeplitz matrices. - ! In our case our matrices satisfy this criteria in the (5:n-4) region - ! and as long as a rank has around at least 20 entries the assumptions - ! we make here are perfectly valid. - - ! bl is the bottom left entry in the 2x2 matrix - ! ur is the upper right entry in the 2x2 matrix - - ! Start - ! At the start we have the 'bl', and assume 'ur' - ! first derivative - bl = d1_sa(1) - ur = d1_sa(1) - recp = 1._dp/(1._dp - ur*bl) - - du_s = recp*(du(i, 1, b) - bl*recv_du_s(i, 1, b)) - dud_s = recp*(dud(i, 1, b) - bl*recv_dud_s(i, 1, b)) - - ! second derivative - bl = d2_sa(1) - ur = d2_sa(1) - recp = 1._dp/(1._dp - ur*bl) - - d2u_s = recp*(d2u(i, 1, b) - bl*recv_d2u_s(i, 1, b)) - - ! End - ! At the end we have the 'ur', and assume 'bl' - ! first derivative - bl = d1_sc(n) - ur = d1_sc(n) - recp = 1._dp/(1._dp - ur*bl) - - du_e = recp*(du(i, n, b) - ur*recv_du_e(i, 1, b)) - dud_e = recp*(dud(i, n, b) - ur*recv_dud_e(i, 1, b)) - - ! second derivative - bl = d2_sc(n) - ur = d2_sc(n) - recp = 1._dp/(1._dp - ur*bl) - - d2u_e = recp*(d2u(i, n, b) - ur*recv_d2u_e(i, 1, b)) - - ! final substitution - r_u(i, 1, b) = -0.5_dp*(conv(i, 1, b)*du_s + dud_s) + nu*d2u_s - do j = 2, n - 1 - du_temp = (du(i, j, b) - d1_sa(j)*du_s - d1_sc(j)*du_e) - dud_temp = (dud(i, j, b) - d1_sa(j)*dud_s - d1_sc(j)*dud_e) - d2u_temp = (d2u(i, j, b) - d2_sa(j)*d2u_s - d2_sc(j)*d2u_e) - r_u(i, j, b) = -0.5_dp*(conv(i, j, b)*du_temp + dud_temp) & - + nu*d2u_temp - end do - r_u(i, n, b) = -0.5_dp*(conv(i, n, b)*du_e + dud_e) + nu*d2u_e - - end subroutine transeq_3fused_subs + implicit none + + ! Arguments + real(dp), device, intent(out), dimension(:, :, :) :: du, send_u_s, & + send_u_e + real(dp), device, intent(in), dimension(:, :, :) :: u, u_s, u_e + real(dp), device, intent(in), dimension(:, :) :: coeffs_s, coeffs_e + real(dp), device, intent(in), dimension(:) :: coeffs + integer, value, intent(in) :: n + real(dp), device, intent(in), dimension(:) :: ffr, fbc, faf + + ! Local variables + integer :: i, j, b, k, lj + integer :: jm2, jm1, jp1, jp2 + + real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4, & + temp_du, alpha, last_r + + i = threadIdx%x + b = blockIdx%x + + ! store bulk coeffs in the registers + c_m4 = coeffs(1); c_m3 = coeffs(2); c_m2 = coeffs(3); c_m1 = coeffs(4) + c_j = coeffs(5) + c_p1 = coeffs(6); c_p2 = coeffs(7); c_p3 = coeffs(8); c_p4 = coeffs(9) + last_r = ffr(1) + + du(i, 1, b) = coeffs_s(1, 1)*u_s(i, 1, b) & + + coeffs_s(2, 1)*u_s(i, 2, b) & + + coeffs_s(3, 1)*u_s(i, 3, b) & + + coeffs_s(4, 1)*u_s(i, 4, b) & + + coeffs_s(5, 1)*u(i, 1, b) & + + coeffs_s(6, 1)*u(i, 2, b) & + + coeffs_s(7, 1)*u(i, 3, b) & + + coeffs_s(8, 1)*u(i, 4, b) & + + coeffs_s(9, 1)*u(i, 5, b) + du(i, 1, b) = du(i, 1, b)*faf(1) + du(i, 2, b) = coeffs_s(1, 2)*u_s(i, 2, b) & + + coeffs_s(2, 2)*u_s(i, 3, b) & + + coeffs_s(3, 2)*u_s(i, 4, b) & + + coeffs_s(4, 2)*u(i, 1, b) & + + coeffs_s(5, 2)*u(i, 2, b) & + + coeffs_s(6, 2)*u(i, 3, b) & + + coeffs_s(7, 2)*u(i, 4, b) & + + coeffs_s(8, 2)*u(i, 5, b) & + + coeffs_s(9, 2)*u(i, 6, b) + du(i, 2, b) = du(i, 2, b)*faf(2) + du(i, 3, b) = coeffs_s(1, 3)*u_s(i, 3, b) & + + coeffs_s(2, 3)*u_s(i, 4, b) & + + coeffs_s(3, 3)*u(i, 1, b) & + + coeffs_s(4, 3)*u(i, 2, b) & + + coeffs_s(5, 3)*u(i, 3, b) & + + coeffs_s(6, 3)*u(i, 4, b) & + + coeffs_s(7, 3)*u(i, 5, b) & + + coeffs_s(8, 3)*u(i, 6, b) & + + coeffs_s(9, 3)*u(i, 7, b) + du(i, 3, b) = ffr(3)*(du(i, 3, b) - faf(3)*du(i, 2, b)) + du(i, 4, b) = coeffs_s(1, 4)*u_s(i, 4, b) & + + coeffs_s(2, 4)*u(i, 1, b) & + + coeffs_s(3, 4)*u(i, 2, b) & + + coeffs_s(4, 4)*u(i, 3, b) & + + coeffs_s(5, 4)*u(i, 4, b) & + + coeffs_s(6, 4)*u(i, 5, b) & + + coeffs_s(7, 4)*u(i, 6, b) & + + coeffs_s(8, 4)*u(i, 7, b) & + + coeffs_s(9, 4)*u(i, 8, b) + du(i, 4, b) = ffr(4)*(du(i, 4, b) - faf(3)*du(i, 3, b)) + + alpha = faf(5) + + do j = 5, n - 4 + temp_du = c_m4*u(i, j - 4, b) + c_m3*u(i, j - 3, b) & + + c_m2*u(i, j - 2, b) + c_m1*u(i, j - 1, b) & + + c_j*u(i, j, b) & + + c_p1*u(i, j + 1, b) + c_p2*u(i, j + 2, b) & + + c_p3*u(i, j + 3, b) + c_p4*u(i, j + 4, b) + du(i, j, b) = ffr(j)*(temp_du - alpha*du(i, j - 1, b)) + end do + + j = n - 3 + du(i, j, b) = coeffs_e(1, 1)*u(i, j - 4, b) & + + coeffs_e(2, 1)*u(i, j - 3, b) & + + coeffs_e(3, 1)*u(i, j - 2, b) & + + coeffs_e(4, 1)*u(i, j - 1, b) & + + coeffs_e(5, 1)*u(i, j, b) & + + coeffs_e(6, 1)*u(i, j + 1, b) & + + coeffs_e(7, 1)*u(i, j + 2, b) & + + coeffs_e(8, 1)*u(i, j + 3, b) & + + coeffs_e(9, 1)*u_e(i, 1, b) + du(i, j, b) = ffr(j)*(du(i, j, b) - faf(j)*du(i, j - 1, b)) + j = n - 2 + du(i, j, b) = coeffs_e(1, 2)*u(i, j - 4, b) & + + coeffs_e(2, 2)*u(i, j - 3, b) & + + coeffs_e(3, 2)*u(i, j - 2, b) & + + coeffs_e(4, 2)*u(i, j - 1, b) & + + coeffs_e(5, 2)*u(i, j, b) & + + coeffs_e(6, 2)*u(i, j + 1, b) & + + coeffs_e(7, 2)*u(i, j + 2, b) & + + coeffs_e(8, 2)*u_e(i, 1, b) & + + coeffs_e(9, 2)*u_e(i, 2, b) + du(i, j, b) = ffr(j)*(du(i, j, b) - faf(j)*du(i, j - 1, b)) + j = n - 1 + du(i, j, b) = coeffs_e(1, 3)*u(i, j - 4, b) & + + coeffs_e(2, 3)*u(i, j - 3, b) & + + coeffs_e(3, 3)*u(i, j - 2, b) & + + coeffs_e(4, 3)*u(i, j - 1, b) & + + coeffs_e(5, 3)*u(i, j, b) & + + coeffs_e(6, 3)*u(i, j + 1, b) & + + coeffs_e(7, 3)*u_e(i, 1, b) & + + coeffs_e(8, 3)*u_e(i, 2, b) & + + coeffs_e(9, 3)*u_e(i, 3, b) + du(i, j, b) = ffr(j)*(du(i, j, b) - faf(j)*du(i, j - 1, b)) + j = n + du(i, j, b) = coeffs_e(1, 4)*u(i, j - 4, b) & + + coeffs_e(2, 4)*u(i, j - 3, b) & + + coeffs_e(3, 4)*u(i, j - 2, b) & + + coeffs_e(4, 4)*u(i, j - 1, b) & + + coeffs_e(5, 4)*u(i, j, b) & + + coeffs_e(6, 4)*u_e(i, 1, b) & + + coeffs_e(7, 4)*u_e(i, 2, b) & + + coeffs_e(8, 4)*u_e(i, 3, b) & + + coeffs_e(9, 4)*u_e(i, 4, b) + du(i, j, b) = ffr(j)*(du(i, j, b) - faf(j)*du(i, j - 1, b)) + + send_u_e(i, 1, b) = du(i, n, b) + + ! Backward pass of the hybrid algorithm + do j = n - 2, 2, -1 + du(i, j, b) = du(i, j, b) - fbc(j)*du(i, j + 1, b) + end do + du(i, 1, b) = last_r*(du(i, 1, b) - fbc(1)*du(i, 2, b)) + send_u_s(i, 1, b) = du(i, 1, b) + + end subroutine der_univ_dist + + attributes(global) subroutine der_univ_subs(du, recv_u_s, recv_u_e, & + n, dist_sa, dist_sc) + implicit none + + ! Arguments + real(dp), device, intent(out), dimension(:, :, :) :: du + real(dp), device, intent(in), dimension(:, :, :) :: recv_u_s, recv_u_e + real(dp), device, intent(in), dimension(:) :: dist_sa, dist_sc + integer, value, intent(in) :: n + + ! Local variables + integer :: i, j, b + real(dp) :: ur, bl, recp, du_s, du_e + + i = threadIdx%x + b = blockIdx%x + + ! A small trick we do here is valid for symmetric Toeplitz matrices. + ! In our case our matrices satisfy this criteria in the (5:n-4) region + ! and as long as a rank has around at least 20 entries the assumptions + ! we make here are perfectly valid. + + ! bl is the bottom left entry in the 2x2 matrix + ! ur is the upper right entry in the 2x2 matrix + + ! Start + ! At the start we have the 'bl', and assume 'ur' + bl = dist_sa(1) + ur = dist_sa(1) + recp = 1._dp/(1._dp - ur*bl) + du_s = recp*(du(i, 1, b) - bl*recv_u_s(i, 1, b)) + + ! End + ! At the end we have the 'ur', and assume 'bl' + bl = dist_sc(n) + ur = dist_sc(n) + recp = 1._dp/(1._dp - ur*bl) + du_e = recp*(du(i, n, b) - ur*recv_u_e(i, 1, b)) + + du(i, 1, b) = du_s + do j = 2, n - 1 + du(i, j, b) = (du(i, j, b) - dist_sa(j)*du_s - dist_sc(j)*du_e) + end do + du(i, n, b) = du_e + +end subroutine der_univ_subs + +attributes(global) subroutine transeq_3fused_dist( & + du, dud, d2u, & + send_du_s, send_du_e, send_dud_s, send_dud_e, send_d2u_s, send_d2u_e, & + u, u_s, u_e, v, v_s, v_e, n, & + d1_coeffs_s, d1_coeffs_e, d1_coeffs, d1_fw, d1_bw, d1_af, & + d2_coeffs_s, d2_coeffs_e, d2_coeffs, d2_fw, d2_bw, d2_af & + ) +implicit none + + ! Arguments +real(dp), device, intent(out), dimension(:, :, :) :: du, dud, d2u +real(dp), device, intent(out), dimension(:, :, :) :: & + send_du_s, send_du_e, send_dud_s, send_dud_e, send_d2u_s, send_d2u_e +real(dp), device, intent(in), dimension(:, :, :) :: u, u_s, u_e, & + v, v_s, v_e +integer, value, intent(in) :: n +real(dp), device, intent(in) :: d1_coeffs_s(:, :), d1_coeffs_e(:, :), & + d1_coeffs(:) +real(dp), device, intent(in) :: d1_fw(:), d1_bw(:), d1_af(:) +real(dp), device, intent(in) :: d2_coeffs_s(:, :), d2_coeffs_e(:, :), & + d2_coeffs(:) +real(dp), device, intent(in) :: d2_fw(:), d2_bw(:), d2_af(:) + + ! Local variables +integer :: i, j, b + +real(dp) :: d1_c_m4, d1_c_m3, d1_c_m2, d1_c_m1, d1_c_j, & + d1_c_p1, d1_c_p2, d1_c_p3, d1_c_p4, & + d1_alpha, d1_last_r +real(dp) :: d2_c_m4, d2_c_m3, d2_c_m2, d2_c_m1, d2_c_j, & + d2_c_p1, d2_c_p2, d2_c_p3, d2_c_p4, & + d2_alpha, d2_last_r +real(dp) :: temp_du, temp_dud, temp_d2u +real(dp) :: u_m4, u_m3, u_m2, u_m1, u_j, u_p1, u_p2, u_p3, u_p4 +real(dp) :: v_m4, v_m3, v_m2, v_m1, v_j, v_p1, v_p2, v_p3, v_p4 +real(dp) :: old_du, old_dud, old_d2u + +i = threadIdx%x +b = blockIdx%x + +d1_last_r = d1_fw(1) +d2_last_r = d2_fw(1) + + ! j = 1 +temp_du = d1_coeffs_s(1, 1)*u_s(i, 1, b) & + + d1_coeffs_s(2, 1)*u_s(i, 2, b) & + + d1_coeffs_s(3, 1)*u_s(i, 3, b) & + + d1_coeffs_s(4, 1)*u_s(i, 4, b) & + + d1_coeffs_s(5, 1)*u(i, 1, b) & + + d1_coeffs_s(6, 1)*u(i, 2, b) & + + d1_coeffs_s(7, 1)*u(i, 3, b) & + + d1_coeffs_s(8, 1)*u(i, 4, b) & + + d1_coeffs_s(9, 1)*u(i, 5, b) +du(i, 1, b) = temp_du*d1_af(1) +temp_dud = d1_coeffs_s(1, 1)*u_s(i, 1, b)*v_s(i, 1, b) & + + d1_coeffs_s(2, 1)*u_s(i, 2, b)*v_s(i, 2, b) & + + d1_coeffs_s(3, 1)*u_s(i, 3, b)*v_s(i, 3, b) & + + d1_coeffs_s(4, 1)*u_s(i, 4, b)*v_s(i, 4, b) & + + d1_coeffs_s(5, 1)*u(i, 1, b)*v(i, 1, b) & + + d1_coeffs_s(6, 1)*u(i, 2, b)*v(i, 2, b) & + + d1_coeffs_s(7, 1)*u(i, 3, b)*v(i, 3, b) & + + d1_coeffs_s(8, 1)*u(i, 4, b)*v(i, 4, b) & + + d1_coeffs_s(9, 1)*u(i, 5, b)*v(i, 5, b) +dud(i, 1, b) = temp_dud*d1_af(1) +temp_d2u = d2_coeffs_s(1, 1)*u_s(i, 1, b) & + + d2_coeffs_s(2, 1)*u_s(i, 2, b) & + + d2_coeffs_s(3, 1)*u_s(i, 3, b) & + + d2_coeffs_s(4, 1)*u_s(i, 4, b) & + + d2_coeffs_s(5, 1)*u(i, 1, b) & + + d2_coeffs_s(6, 1)*u(i, 2, b) & + + d2_coeffs_s(7, 1)*u(i, 3, b) & + + d2_coeffs_s(8, 1)*u(i, 4, b) & + + d2_coeffs_s(9, 1)*u(i, 5, b) +d2u(i, 1, b) = temp_d2u*d2_af(1) + ! j = 2 +temp_du = d1_coeffs_s(1, 2)*u_s(i, 2, b) & + + d1_coeffs_s(2, 2)*u_s(i, 3, b) & + + d1_coeffs_s(3, 2)*u_s(i, 4, b) & + + d1_coeffs_s(4, 2)*u(i, 1, b) & + + d1_coeffs_s(5, 2)*u(i, 2, b) & + + d1_coeffs_s(6, 2)*u(i, 3, b) & + + d1_coeffs_s(7, 2)*u(i, 4, b) & + + d1_coeffs_s(8, 2)*u(i, 5, b) & + + d1_coeffs_s(9, 2)*u(i, 6, b) +du(i, 2, b) = temp_du*d1_af(2) +temp_dud = d1_coeffs_s(1, 2)*u_s(i, 2, b)*v_s(i, 2, b) & + + d1_coeffs_s(2, 2)*u_s(i, 3, b)*v_s(i, 3, b) & + + d1_coeffs_s(3, 2)*u_s(i, 4, b)*v_s(i, 4, b) & + + d1_coeffs_s(4, 2)*u(i, 1, b)*v(i, 1, b) & + + d1_coeffs_s(5, 2)*u(i, 2, b)*v(i, 2, b) & + + d1_coeffs_s(6, 2)*u(i, 3, b)*v(i, 3, b) & + + d1_coeffs_s(7, 2)*u(i, 4, b)*v(i, 4, b) & + + d1_coeffs_s(8, 2)*u(i, 5, b)*v(i, 5, b) & + + d1_coeffs_s(9, 2)*u(i, 6, b)*v(i, 6, b) +dud(i, 2, b) = temp_dud*d1_af(2) +temp_d2u = d2_coeffs_s(1, 2)*u_s(i, 2, b) & + + d2_coeffs_s(2, 2)*u_s(i, 3, b) & + + d2_coeffs_s(3, 2)*u_s(i, 4, b) & + + d2_coeffs_s(4, 2)*u(i, 1, b) & + + d2_coeffs_s(5, 2)*u(i, 2, b) & + + d2_coeffs_s(6, 2)*u(i, 3, b) & + + d2_coeffs_s(7, 2)*u(i, 4, b) & + + d2_coeffs_s(8, 2)*u(i, 5, b) & + + d2_coeffs_s(9, 2)*u(i, 6, b) +d2u(i, 2, b) = temp_d2u*d2_af(2) + ! j = 3 +temp_du = d1_coeffs_s(1, 3)*u_s(i, 3, b) & + + d1_coeffs_s(2, 3)*u_s(i, 4, b) & + + d1_coeffs_s(3, 3)*u(i, 1, b) & + + d1_coeffs_s(4, 3)*u(i, 2, b) & + + d1_coeffs_s(5, 3)*u(i, 3, b) & + + d1_coeffs_s(6, 3)*u(i, 4, b) & + + d1_coeffs_s(7, 3)*u(i, 5, b) & + + d1_coeffs_s(8, 3)*u(i, 6, b) & + + d1_coeffs_s(9, 3)*u(i, 7, b) +du(i, 3, b) = d1_fw(3)*(temp_du - d1_af(3)*du(i, 2, b)) +temp_dud = d1_coeffs_s(1, 3)*u_s(i, 3, b)*v_s(i, 3, b) & + + d1_coeffs_s(2, 3)*u_s(i, 4, b)*v_s(i, 4, b) & + + d1_coeffs_s(3, 3)*u(i, 1, b)*v(i, 1, b) & + + d1_coeffs_s(4, 3)*u(i, 2, b)*v(i, 2, b) & + + d1_coeffs_s(5, 3)*u(i, 3, b)*v(i, 3, b) & + + d1_coeffs_s(6, 3)*u(i, 4, b)*v(i, 4, b) & + + d1_coeffs_s(7, 3)*u(i, 5, b)*v(i, 5, b) & + + d1_coeffs_s(8, 3)*u(i, 6, b)*v(i, 6, b) & + + d1_coeffs_s(9, 3)*u(i, 7, b)*v(i, 7, b) +dud(i, 3, b) = d1_fw(3)*(temp_dud - d1_af(3)*dud(i, 2, b)) +temp_d2u = d2_coeffs_s(1, 3)*u_s(i, 3, b) & + + d2_coeffs_s(2, 3)*u_s(i, 4, b) & + + d2_coeffs_s(3, 3)*u(i, 1, b) & + + d2_coeffs_s(4, 3)*u(i, 2, b) & + + d2_coeffs_s(5, 3)*u(i, 3, b) & + + d2_coeffs_s(6, 3)*u(i, 4, b) & + + d2_coeffs_s(7, 3)*u(i, 5, b) & + + d2_coeffs_s(8, 3)*u(i, 6, b) & + + d2_coeffs_s(9, 3)*u(i, 7, b) +d2u(i, 3, b) = d2_fw(3)*(temp_d2u - d2_af(3)*d2u(i, 2, b)) + ! j = 4 +temp_du = d1_coeffs_s(1, 4)*u_s(i, 4, b) & + + d1_coeffs_s(2, 4)*u(i, 1, b) & + + d1_coeffs_s(3, 4)*u(i, 2, b) & + + d1_coeffs_s(4, 4)*u(i, 3, b) & + + d1_coeffs_s(5, 4)*u(i, 4, b) & + + d1_coeffs_s(6, 4)*u(i, 5, b) & + + d1_coeffs_s(7, 4)*u(i, 6, b) & + + d1_coeffs_s(8, 4)*u(i, 7, b) & + + d1_coeffs_s(9, 4)*u(i, 8, b) +du(i, 4, b) = d1_fw(4)*(temp_du - d1_af(3)*du(i, 3, b)) +temp_dud = d1_coeffs_s(1, 4)*u_s(i, 4, b)*v_s(i, 4, b) & + + d1_coeffs_s(2, 4)*u(i, 1, b)*v(i, 1, b) & + + d1_coeffs_s(3, 4)*u(i, 2, b)*v(i, 2, b) & + + d1_coeffs_s(4, 4)*u(i, 3, b)*v(i, 3, b) & + + d1_coeffs_s(5, 4)*u(i, 4, b)*v(i, 4, b) & + + d1_coeffs_s(6, 4)*u(i, 5, b)*v(i, 5, b) & + + d1_coeffs_s(7, 4)*u(i, 6, b)*v(i, 6, b) & + + d1_coeffs_s(8, 4)*u(i, 7, b)*v(i, 7, b) & + + d1_coeffs_s(9, 4)*u(i, 8, b)*v(i, 8, b) +dud(i, 4, b) = d1_fw(4)*(temp_dud - d1_af(3)*dud(i, 3, b)) +temp_d2u = d2_coeffs_s(1, 4)*u_s(i, 4, b) & + + d2_coeffs_s(2, 4)*u(i, 1, b) & + + d2_coeffs_s(3, 4)*u(i, 2, b) & + + d2_coeffs_s(4, 4)*u(i, 3, b) & + + d2_coeffs_s(5, 4)*u(i, 4, b) & + + d2_coeffs_s(6, 4)*u(i, 5, b) & + + d2_coeffs_s(7, 4)*u(i, 6, b) & + + d2_coeffs_s(8, 4)*u(i, 7, b) & + + d2_coeffs_s(9, 4)*u(i, 8, b) +d2u(i, 4, b) = d2_fw(4)*(temp_d2u - d2_af(3)*d2u(i, 3, b)) + +d1_alpha = d1_af(5) +d2_alpha = d2_af(5) + + ! store bulk coeffs in the registers +d1_c_m4 = d1_coeffs(1); d1_c_m3 = d1_coeffs(2) +d1_c_m2 = d1_coeffs(3); d1_c_m1 = d1_coeffs(4) +d1_c_j = d1_coeffs(5) +d1_c_p1 = d1_coeffs(6); d1_c_p2 = d1_coeffs(7) +d1_c_p3 = d1_coeffs(8); d1_c_p4 = d1_coeffs(9) + +d2_c_m4 = d2_coeffs(1); d2_c_m3 = d2_coeffs(2) +d2_c_m2 = d2_coeffs(3); d2_c_m1 = d2_coeffs(4) +d2_c_j = d2_coeffs(5) +d2_c_p1 = d2_coeffs(6); d2_c_p2 = d2_coeffs(7) +d2_c_p3 = d2_coeffs(8); d2_c_p4 = d2_coeffs(9) + + ! It is better to access d?(i, j - 1, b) via old_d? +old_du = du(i, 4, b) +old_dud = dud(i, 4, b) +old_d2u = d2u(i, 4, b) + + ! Populate registers with the u and v stencils +u_m4 = u(i, 1, b); u_m3 = u(i, 2, b) +u_m2 = u(i, 3, b); u_m1 = u(i, 4, b) +u_j = u(i, 5, b); u_p1 = u(i, 6, b) +u_p2 = u(i, 7, b); u_p3 = u(i, 8, b) +v_m4 = v(i, 1, b); v_m3 = v(i, 2, b) +v_m2 = v(i, 3, b); v_m1 = v(i, 4, b) +v_j = v(i, 5, b); v_p1 = v(i, 6, b) +v_p2 = v(i, 7, b); v_p3 = v(i, 8, b) + +do j = 5, n - 4 + u_p4 = u(i, j+4, b); v_p4 = v(i, j+4, b) + + ! du + temp_du = d1_c_m4*u_m4 + d1_c_m3*u_m3 + d1_c_m2*u_m2 + d1_c_m1*u_m1 & + + d1_c_j*u_j & + + d1_c_p1*u_p1 + d1_c_p2*u_p2 + d1_c_p3*u_p3 + d1_c_p4*u_p4 + du(i, j, b) = d1_fw(j)*(temp_du - d1_alpha*old_du) + old_du = du(i, j, b) + + ! dud + temp_dud = d1_c_m4*u_m4*v_m4 + d1_c_m3*u_m3*v_m3 & + + d1_c_m2*u_m2*v_m2 + d1_c_m1*u_m1*v_m1 & + + d1_c_j*u_j*v_j & + + d1_c_p1*u_p1*v_p1 + d1_c_p2*u_p2*v_p2 & + + d1_c_p3*u_p3*v_p3 + d1_c_p4*u_p4*v_p4 + dud(i, j, b) = d1_fw(j)*(temp_dud - d1_alpha*old_dud) + old_dud = dud(i, j, b) + + ! d2u + temp_d2u = d2_c_m4*u_m4 + d2_c_m3*u_m3 + d2_c_m2*u_m2 + d2_c_m1*u_m1 & + + d2_c_j*u_j & + + d2_c_p1*u_p1 + d2_c_p2*u_p2 + d2_c_p3*u_p3 + d2_c_p4*u_p4 + d2u(i, j, b) = d2_fw(j)*(temp_d2u - d2_alpha*old_d2u) + old_d2u = d2u(i, j, b) + + ! Prepare registers for the next step + u_m4 = u_m3; u_m3 = u_m2; u_m2 = u_m1; u_m1 = u_j + u_j = u_p1; u_p1 = u_p2; u_p2 = u_p3; u_p3 = u_p4 + v_m4 = v_m3; v_m3 = v_m2; v_m2 = v_m1; v_m1 = v_j + v_j = v_p1; v_p1 = v_p2; v_p2 = v_p3; v_p3 = v_p4 +end do + +j = n - 3 +temp_du = d1_coeffs_e(1, 1)*u(i, j - 4, b) & + + d1_coeffs_e(2, 1)*u(i, j - 3, b) & + + d1_coeffs_e(3, 1)*u(i, j - 2, b) & + + d1_coeffs_e(4, 1)*u(i, j - 1, b) & + + d1_coeffs_e(5, 1)*u(i, j, b) & + + d1_coeffs_e(6, 1)*u(i, j + 1, b) & + + d1_coeffs_e(7, 1)*u(i, j + 2, b) & + + d1_coeffs_e(8, 1)*u(i, j + 3, b) & + + d1_coeffs_e(9, 1)*u_e(i, 1, b) +du(i, j, b) = d1_fw(j)*(temp_du - d1_af(j)*du(i, j - 1, b)) +temp_dud = d1_coeffs_e(1, 1)*u(i, j - 4, b)*v(i, j - 4, b) & + + d1_coeffs_e(2, 1)*u(i, j - 3, b)*v(i, j - 3, b) & + + d1_coeffs_e(3, 1)*u(i, j - 2, b)*v(i, j - 2, b) & + + d1_coeffs_e(4, 1)*u(i, j - 1, b)*v(i, j - 1, b) & + + d1_coeffs_e(5, 1)*u(i, j, b)*v(i, j, b) & + + d1_coeffs_e(6, 1)*u(i, j + 1, b)*v(i, j + 1, b) & + + d1_coeffs_e(7, 1)*u(i, j + 2, b)*v(i, j + 2, b) & + + d1_coeffs_e(8, 1)*u(i, j + 3, b)*v(i, j + 3, b) & + + d1_coeffs_e(9, 1)*u_e(i, 1, b)*v_e(i, 1, b) +dud(i, j, b) = d1_fw(j)*(temp_dud - d1_af(j)*dud(i, j - 1, b)) +temp_d2u = d1_coeffs_e(1, 1)*u(i, j - 4, b) & + + d2_coeffs_e(2, 1)*u(i, j - 3, b) & + + d2_coeffs_e(3, 1)*u(i, j - 2, b) & + + d2_coeffs_e(4, 1)*u(i, j - 1, b) & + + d2_coeffs_e(5, 1)*u(i, j, b) & + + d2_coeffs_e(6, 1)*u(i, j + 1, b) & + + d2_coeffs_e(7, 1)*u(i, j + 2, b) & + + d2_coeffs_e(8, 1)*u(i, j + 3, b) & + + d2_coeffs_e(9, 1)*u_e(i, 1, b) +d2u(i, j, b) = d2_fw(j)*(temp_d2u - d2_af(j)*d2u(i, j - 1, b)) +j = n - 2 +temp_du = d1_coeffs_e(1, 2)*u(i, j - 4, b) & + + d1_coeffs_e(2, 2)*u(i, j - 3, b) & + + d1_coeffs_e(3, 2)*u(i, j - 2, b) & + + d1_coeffs_e(4, 2)*u(i, j - 1, b) & + + d1_coeffs_e(5, 2)*u(i, j, b) & + + d1_coeffs_e(6, 2)*u(i, j + 1, b) & + + d1_coeffs_e(7, 2)*u(i, j + 2, b) & + + d1_coeffs_e(8, 2)*u_e(i, 1, b) & + + d1_coeffs_e(9, 2)*u_e(i, 2, b) +du(i, j, b) = d1_fw(j)*(temp_du - d1_af(j)*du(i, j - 1, b)) +temp_dud = d1_coeffs_e(1, 2)*u(i, j - 4, b)*v(i, j - 4, b) & + + d1_coeffs_e(2, 2)*u(i, j - 3, b)*v(i, j - 3, b) & + + d1_coeffs_e(3, 2)*u(i, j - 2, b)*v(i, j - 2, b) & + + d1_coeffs_e(4, 2)*u(i, j - 1, b)*v(i, j - 1, b) & + + d1_coeffs_e(5, 2)*u(i, j, b)*v(i, j, b) & + + d1_coeffs_e(6, 2)*u(i, j + 1, b)*v(i, j + 1, b) & + + d1_coeffs_e(7, 2)*u(i, j + 2, b)*v(i, j + 2, b) & + + d1_coeffs_e(8, 2)*u_e(i, 1, b)*v_e(i, 1, b) & + + d1_coeffs_e(9, 2)*u_e(i, 2, b)*v_e(i, 2, b) +dud(i, j, b) = d1_fw(j)*(temp_dud - d1_af(j)*dud(i, j - 1, b)) +temp_d2u = d2_coeffs_e(1, 2)*u(i, j - 4, b) & + + d2_coeffs_e(2, 2)*u(i, j - 3, b) & + + d2_coeffs_e(3, 2)*u(i, j - 2, b) & + + d2_coeffs_e(4, 2)*u(i, j - 1, b) & + + d2_coeffs_e(5, 2)*u(i, j, b) & + + d2_coeffs_e(6, 2)*u(i, j + 1, b) & + + d2_coeffs_e(7, 2)*u(i, j + 2, b) & + + d2_coeffs_e(8, 2)*u_e(i, 1, b) & + + d2_coeffs_e(9, 2)*u_e(i, 2, b) +d2u(i, j, b) = d2_fw(j)*(temp_d2u - d2_af(j)*d2u(i, j - 1, b)) +j = n - 1 +temp_du = d1_coeffs_e(1, 3)*u(i, j - 4, b) & + + d1_coeffs_e(2, 3)*u(i, j - 3, b) & + + d1_coeffs_e(3, 3)*u(i, j - 2, b) & + + d1_coeffs_e(4, 3)*u(i, j - 1, b) & + + d1_coeffs_e(5, 3)*u(i, j, b) & + + d1_coeffs_e(6, 3)*u(i, j + 1, b) & + + d1_coeffs_e(7, 3)*u_e(i, 1, b) & + + d1_coeffs_e(8, 3)*u_e(i, 2, b) & + + d1_coeffs_e(9, 3)*u_e(i, 3, b) +du(i, j, b) = d1_fw(j)*(temp_du - d1_af(j)*du(i, j - 1, b)) +temp_dud = d1_coeffs_e(1, 3)*u(i, j - 4, b)*v(i, j - 4, b) & + + d1_coeffs_e(2, 3)*u(i, j - 3, b)*v(i, j - 3, b) & + + d1_coeffs_e(3, 3)*u(i, j - 2, b)*v(i, j - 2, b) & + + d1_coeffs_e(4, 3)*u(i, j - 1, b)*v(i, j - 1, b) & + + d1_coeffs_e(5, 3)*u(i, j, b)*v(i, j, b) & + + d1_coeffs_e(6, 3)*u(i, j + 1, b)*v(i, j + 1, b) & + + d1_coeffs_e(7, 3)*u_e(i, 1, b)*v_e(i, 1, b) & + + d1_coeffs_e(8, 3)*u_e(i, 2, b)*v_e(i, 2, b) & + + d1_coeffs_e(9, 3)*u_e(i, 3, b)*v_e(i, 3, b) +dud(i, j, b) = d1_fw(j)*(temp_dud - d1_af(j)*dud(i, j - 1, b)) +temp_d2u = d2_coeffs_e(1, 3)*u(i, j - 4, b) & + + d2_coeffs_e(2, 3)*u(i, j - 3, b) & + + d2_coeffs_e(3, 3)*u(i, j - 2, b) & + + d2_coeffs_e(4, 3)*u(i, j - 1, b) & + + d2_coeffs_e(5, 3)*u(i, j, b) & + + d2_coeffs_e(6, 3)*u(i, j + 1, b) & + + d2_coeffs_e(7, 3)*u_e(i, 1, b) & + + d2_coeffs_e(8, 3)*u_e(i, 2, b) & + + d2_coeffs_e(9, 3)*u_e(i, 3, b) +d2u(i, j, b) = d2_fw(j)*(temp_d2u - d2_af(j)*d2u(i, j - 1, b)) +j = n +temp_du = d1_coeffs_e(1, 4)*u(i, j - 4, b) & + + d1_coeffs_e(2, 4)*u(i, j - 3, b) & + + d1_coeffs_e(3, 4)*u(i, j - 2, b) & + + d1_coeffs_e(4, 4)*u(i, j - 1, b) & + + d1_coeffs_e(5, 4)*u(i, j, b) & + + d1_coeffs_e(6, 4)*u_e(i, 1, b) & + + d1_coeffs_e(7, 4)*u_e(i, 2, b) & + + d1_coeffs_e(8, 4)*u_e(i, 3, b) & + + d1_coeffs_e(9, 4)*u_e(i, 4, b) +du(i, j, b) = d1_fw(j)*(temp_du - d1_af(j)*du(i, j - 1, b)) +temp_dud = d1_coeffs_e(1, 4)*u(i, j - 4, b)*v(i, j - 4, b) & + + d1_coeffs_e(2, 4)*u(i, j - 3, b)*v(i, j - 3, b) & + + d1_coeffs_e(3, 4)*u(i, j - 2, b)*v(i, j - 2, b) & + + d1_coeffs_e(4, 4)*u(i, j - 1, b)*v(i, j - 1, b) & + + d1_coeffs_e(5, 4)*u(i, j, b)*v(i, j, b) & + + d1_coeffs_e(6, 4)*u_e(i, 1, b)*v_e(i, 1, b) & + + d1_coeffs_e(7, 4)*u_e(i, 2, b)*v_e(i, 2, b) & + + d1_coeffs_e(8, 4)*u_e(i, 3, b)*v_e(i, 3, b) & + + d1_coeffs_e(9, 4)*u_e(i, 4, b)*v_e(i, 4, b) +dud(i, j, b) = d1_fw(j)*(temp_dud - d1_af(j)*dud(i, j - 1, b)) +temp_d2u = d2_coeffs_e(1, 4)*u(i, j - 4, b) & + + d2_coeffs_e(2, 4)*u(i, j - 3, b) & + + d2_coeffs_e(3, 4)*u(i, j - 2, b) & + + d2_coeffs_e(4, 4)*u(i, j - 1, b) & + + d2_coeffs_e(5, 4)*u(i, j, b) & + + d2_coeffs_e(6, 4)*u_e(i, 1, b) & + + d2_coeffs_e(7, 4)*u_e(i, 2, b) & + + d2_coeffs_e(8, 4)*u_e(i, 3, b) & + + d2_coeffs_e(9, 4)*u_e(i, 4, b) +d2u(i, j, b) = d2_fw(j)*(temp_d2u - d2_af(j)*d2u(i, j - 1, b)) + +send_du_e(i, 1, b) = du(i, n, b) +send_dud_e(i, 1, b) = dud(i, n, b) +send_d2u_e(i, 1, b) = d2u(i, n, b) + + ! Backward pass of the hybrid algorithm +do j = n - 2, 2, -1 + du(i, j, b) = du(i, j, b) - d1_bw(j)*du(i, j + 1, b) + dud(i, j, b) = dud(i, j, b) - d1_bw(j)*dud(i, j + 1, b) + d2u(i, j, b) = d2u(i, j, b) - d2_bw(j)*d2u(i, j + 1, b) +end do +du(i, 1, b) = d1_last_r*(du(i, 1, b) - d1_bw(1)*du(i, 2, b)) +dud(i, 1, b) = d1_last_r*(dud(i, 1, b) - d1_bw(1)*dud(i, 2, b)) +d2u(i, 1, b) = d2_last_r*(d2u(i, 1, b) - d2_bw(1)*d2u(i, 2, b)) + +send_du_s(i, 1, b) = du(i, 1, b) +send_dud_s(i, 1, b) = dud(i, 1, b) +send_d2u_s(i, 1, b) = d2u(i, 1, b) + +end subroutine transeq_3fused_dist + +attributes(global) subroutine transeq_3fused_subs( & + r_u, conv, du, dud, d2u, & + recv_du_s, recv_du_e, recv_dud_s, recv_dud_e, recv_d2u_s, recv_d2u_e, & + d1_sa, d1_sc, d2_sa, d2_sc, n, nu & + ) +implicit none + + ! Arguments +real(dp), device, intent(out), dimension(:, :, :) :: r_u +real(dp), device, intent(in), dimension(:, :, :) :: conv, du, dud, d2u +real(dp), device, intent(in), dimension(:, :, :) :: & + recv_du_s, recv_du_e, recv_dud_s, recv_dud_e, recv_d2u_s, recv_d2u_e +real(dp), device, intent(in), dimension(:) :: d1_sa, d1_sc, d2_sa, d2_sc +integer, value, intent(in) :: n +real(dp), value, intent(in) :: nu + + ! Local variables +integer :: i, j, b +real(dp) :: ur, bl, recp +real(dp) :: du_temp, dud_temp, d2u_temp +real(dp) :: du_s, du_e, dud_s, dud_e, d2u_s, d2u_e + +i = threadIdx%x +b = blockIdx%x + + ! A small trick we do here is valid for symmetric Toeplitz matrices. + ! In our case our matrices satisfy this criteria in the (5:n-4) region + ! and as long as a rank has around at least 20 entries the assumptions + ! we make here are perfectly valid. + + ! bl is the bottom left entry in the 2x2 matrix + ! ur is the upper right entry in the 2x2 matrix + + ! Start + ! At the start we have the 'bl', and assume 'ur' + ! first derivative +bl = d1_sa(1) +ur = d1_sa(1) +recp = 1._dp/(1._dp - ur*bl) + +du_s = recp*(du(i, 1, b) - bl*recv_du_s(i, 1, b)) +dud_s = recp*(dud(i, 1, b) - bl*recv_dud_s(i, 1, b)) + + ! second derivative +bl = d2_sa(1) +ur = d2_sa(1) +recp = 1._dp/(1._dp - ur*bl) + +d2u_s = recp*(d2u(i, 1, b) - bl*recv_d2u_s(i, 1, b)) + + ! End + ! At the end we have the 'ur', and assume 'bl' + ! first derivative +bl = d1_sc(n) +ur = d1_sc(n) +recp = 1._dp/(1._dp - ur*bl) + +du_e = recp*(du(i, n, b) - ur*recv_du_e(i, 1, b)) +dud_e = recp*(dud(i, n, b) - ur*recv_dud_e(i, 1, b)) + + ! second derivative +bl = d2_sc(n) +ur = d2_sc(n) +recp = 1._dp/(1._dp - ur*bl) + +d2u_e = recp*(d2u(i, n, b) - ur*recv_d2u_e(i, 1, b)) + + ! final substitution +r_u(i, 1, b) = -0.5_dp*(conv(i, 1, b)*du_s + dud_s) + nu*d2u_s +do j = 2, n - 1 + du_temp = (du(i, j, b) - d1_sa(j)*du_s - d1_sc(j)*du_e) + dud_temp = (dud(i, j, b) - d1_sa(j)*dud_s - d1_sc(j)*dud_e) + d2u_temp = (d2u(i, j, b) - d2_sa(j)*d2u_s - d2_sc(j)*d2u_e) + r_u(i, j, b) = -0.5_dp*(conv(i, j, b)*du_temp + dud_temp) & + + nu*d2u_temp +end do +r_u(i, n, b) = -0.5_dp*(conv(i, n, b)*du_e + dud_e) + nu*d2u_e + +end subroutine transeq_3fused_subs end module m_cuda_kernels_dist diff --git a/src/cuda/kernels/reorder.f90 b/src/cuda/kernels/reorder.f90 index 52ccd0e5..70b482b6 100644 --- a/src/cuda/kernels/reorder.f90 +++ b/src/cuda/kernels/reorder.f90 @@ -1,288 +1,288 @@ -module m_cuda_kernels_reorder - use cudafor + module m_cuda_kernels_reorder + use cudafor - use m_common, only: dp - use m_cuda_common, only: SZ + use m_common, only: dp + use m_cuda_common, only: SZ -contains + contains - attributes(global) subroutine reorder_c2x(u_x, u_c, nz) - implicit none + attributes(global) subroutine reorder_c2x(u_x, u_c, nz) + implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_c - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_x + real(dp), device, intent(in), dimension(:, :, :) :: u_c + integer, value, intent(in) :: nz - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) + integer :: i, j, b_i, b_j, b_k - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z - ! copy into shared - tile(i, j) = u_c(i + (b_i - 1)*SZ, j + (b_j - 1)*SZ, b_k) + ! copy into shared + tile(i, j) = u_c(i + (b_i - 1)*SZ, j + (b_j - 1)*SZ, b_k) - call syncthreads() + call syncthreads() - ! copy into output array from shared - u_x(i, j + (b_i - 1)*SZ, b_k + (b_j - 1)*nz) = tile(j, i) + ! copy into output array from shared + u_x(i, j + (b_i - 1)*SZ, b_k + (b_j - 1)*nz) = tile(j, i) - end subroutine reorder_c2x + end subroutine reorder_c2x - attributes(global) subroutine reorder_x2c(u_c, u_x, nz) - implicit none + attributes(global) subroutine reorder_x2c(u_c, u_x, nz) + implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_c - real(dp), device, intent(in), dimension(:, :, :) :: u_x - integer, value, intent(in) :: nz + real(dp), device, intent(out), dimension(:, :, :) :: u_c + real(dp), device, intent(in), dimension(:, :, :) :: u_x + integer, value, intent(in) :: nz - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k + real(dp), shared :: tile(SZ, SZ) + integer :: i, j, b_i, b_j, b_k - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z + i = threadIdx%x; j = threadIdx%y + b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z - ! copy into shared - tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_k + (b_j - 1)*nz) + ! copy into shared + tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_k + (b_j - 1)*nz) - call syncthreads() + call syncthreads() - ! copy into output array from shared - u_c(i + (b_i - 1)*SZ, j + (b_j - 1)*SZ, b_k) = tile(j, i) + ! copy into output array from shared + u_c(i + (b_i - 1)*SZ, j + (b_j - 1)*SZ, b_k) = tile(j, i) - end subroutine reorder_x2c +end subroutine reorder_x2c - attributes(global) subroutine reorder_x2y(u_y, u_x, nz) - implicit none +attributes(global) subroutine reorder_x2y(u_y, u_x, nz) +implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_y - real(dp), device, intent(in), dimension(:, :, :) :: u_x - integer, value, intent(in) :: nz +real(dp), device, intent(out), dimension(:, :, :) :: u_y +real(dp), device, intent(in), dimension(:, :, :) :: u_x +integer, value, intent(in) :: nz - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k +real(dp), shared :: tile(SZ, SZ) +integer :: i, j, b_i, b_j, b_k - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z +i = threadIdx%x; j = threadIdx%y +b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z - ! copy into shared - tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_j + (b_k - 1)*nz) + ! copy into shared +tile(i, j) = u_x(i, j + (b_i - 1)*SZ, b_j + (b_k - 1)*nz) - call syncthreads() +call syncthreads() - ! copy into output array from shared - u_y(i, j + (b_k - 1)*SZ, b_j + (b_i - 1)*nz) = tile(j, i) + ! copy into output array from shared +u_y(i, j + (b_k - 1)*SZ, b_j + (b_i - 1)*nz) = tile(j, i) - end subroutine reorder_x2y +end subroutine reorder_x2y - attributes(global) subroutine reorder_x2z(u_z, u_x, nz) - implicit none +attributes(global) subroutine reorder_x2z(u_z, u_x, nz) +implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_z - real(dp), device, intent(in), dimension(:, :, :) :: u_x - integer, value, intent(in) :: nz +real(dp), device, intent(out), dimension(:, :, :) :: u_z +real(dp), device, intent(in), dimension(:, :, :) :: u_x +integer, value, intent(in) :: nz - integer :: i, j, b_i, b_j, nx +integer :: i, j, b_i, b_j, nx - i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y - nx = gridDim%x +i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y +nx = gridDim%x - ! Data access pattern for reordering between x and z is quite nice - ! thus we don't need to use shared memory for this operation. - do j = 1, nz - u_z(i, j, b_i + (b_j - 1)*nx) = u_x(i, b_i, j + (b_j - 1)*nz) - end do + ! Data access pattern for reordering between x and z is quite nice + ! thus we don't need to use shared memory for this operation. +do j = 1, nz + u_z(i, j, b_i + (b_j - 1)*nx) = u_x(i, b_i, j + (b_j - 1)*nz) +end do - end subroutine reorder_x2z +end subroutine reorder_x2z - attributes(global) subroutine reorder_y2x(u_x, u_y, nz) - implicit none +attributes(global) subroutine reorder_y2x(u_x, u_y, nz) +implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_y - integer, value, intent(in) :: nz +real(dp), device, intent(out), dimension(:, :, :) :: u_x +real(dp), device, intent(in), dimension(:, :, :) :: u_y +integer, value, intent(in) :: nz - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k +real(dp), shared :: tile(SZ, SZ) +integer :: i, j, b_i, b_j, b_k - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z +i = threadIdx%x; j = threadIdx%y +b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z - ! copy into shared - tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) + ! copy into shared +tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) - call syncthreads() +call syncthreads() - ! copy into output array from shared - u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + b_k) = tile(j, i) + ! copy into output array from shared +u_x(i, (b_i - 1)*SZ + j, (b_j - 1)*nz + b_k) = tile(j, i) - end subroutine reorder_y2x +end subroutine reorder_y2x - attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz) - implicit none +attributes(global) subroutine reorder_y2z(u_z, u_y, nx, nz) +implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_z - real(dp), device, intent(in), dimension(:, :, :) :: u_y - integer, value, intent(in) :: nx, nz +real(dp), device, intent(out), dimension(:, :, :) :: u_z +real(dp), device, intent(in), dimension(:, :, :) :: u_y +integer, value, intent(in) :: nx, nz - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k +real(dp), shared :: tile(SZ, SZ) +integer :: i, j, b_i, b_j, b_k - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z +i = threadIdx%x; j = threadIdx%y +b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z - ! copy into shared - tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) + ! copy into shared +tile(i, j) = u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) - call syncthreads() +call syncthreads() - ! copy into output array from shared - u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) = tile(j, i) + ! copy into output array from shared +u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) = tile(j, i) - end subroutine reorder_y2z +end subroutine reorder_y2z - attributes(global) subroutine reorder_z2x(u_x, u_z, nz) - implicit none +attributes(global) subroutine reorder_z2x(u_x, u_z, nz) +implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_z - integer, value, intent(in) :: nz +real(dp), device, intent(out), dimension(:, :, :) :: u_x +real(dp), device, intent(in), dimension(:, :, :) :: u_z +integer, value, intent(in) :: nz - integer :: i, j, b_i, b_j, nx +integer :: i, j, b_i, b_j, nx - i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y - nx = gridDim%x +i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y +nx = gridDim%x - do j = 1, nz - u_x(i, b_i, j + (b_j - 1)*nz) = u_z(i, j, b_i + (b_j - 1)*nx) - end do +do j = 1, nz + u_x(i, b_i, j + (b_j - 1)*nz) = u_z(i, j, b_i + (b_j - 1)*nx) +end do - end subroutine reorder_z2x +end subroutine reorder_z2x - attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz) - implicit none +attributes(global) subroutine reorder_z2y(u_y, u_z, nx, nz) +implicit none - real(dp), device, intent(out), dimension(:, :, :) :: u_y - real(dp), device, intent(in), dimension(:, :, :) :: u_z - integer, value, intent(in) :: nx, nz +real(dp), device, intent(out), dimension(:, :, :) :: u_y +real(dp), device, intent(in), dimension(:, :, :) :: u_z +integer, value, intent(in) :: nx, nz - real(dp), shared :: tile(SZ, SZ) - integer :: i, j, b_i, b_j, b_k +real(dp), shared :: tile(SZ, SZ) +integer :: i, j, b_i, b_j, b_k - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z +i = threadIdx%x; j = threadIdx%y +b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z - ! copy into shared - tile(i, j) = u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) + ! copy into shared +tile(i, j) = u_z(i, b_k, (b_i - 1)*SZ + j + (b_j - 1)*nx) - call syncthreads() +call syncthreads() - ! copy into output array from shared - u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) = tile(j, i) + ! copy into output array from shared +u_y(i, (b_j - 1)*SZ + j, (b_i - 1)*nz + b_k) = tile(j, i) - end subroutine reorder_z2y +end subroutine reorder_z2y - attributes(global) subroutine sum_yintox(u_x, u_y, nz) - implicit none +attributes(global) subroutine sum_yintox(u_x, u_y, nz) +implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_y - integer, value, intent(in) :: nz +real(dp), device, intent(inout), dimension(:, :, :) :: u_x +real(dp), device, intent(in), dimension(:, :, :) :: u_y +integer, value, intent(in) :: nz - real(dp), shared :: tile(SZ,SZ) - integer :: i, j, b_i, b_j, b_k +real(dp), shared :: tile(SZ,SZ) +integer :: i, j, b_i, b_j, b_k - i = threadIdx%x; j = threadIdx%y - b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z +i = threadIdx%x; j = threadIdx%y +b_i = blockIdx%x; b_j = blockIdx%y; b_k = blockIdx%z - ! copy into shared - tile(i, j) = u_y(i, (b_j-1)*SZ+j, (b_k)+nz*(b_i-1)) + ! copy into shared +tile(i, j) = u_y(i, (b_j-1)*SZ+j, (b_k)+nz*(b_i-1)) - call syncthreads() +call syncthreads() - ! copy into output array from shared - u_x(i, (b_i-1)*SZ+j, (b_j-1)*nz+(b_k)) = & - u_x(i, (b_i-1)*SZ+j, (b_j-1)*nz+(b_k)) + tile(j, i) + ! copy into output array from shared +u_x(i, (b_i-1)*SZ+j, (b_j-1)*nz+(b_k)) = & + u_x(i, (b_i-1)*SZ+j, (b_j-1)*nz+(b_k)) + tile(j, i) - end subroutine sum_yintox +end subroutine sum_yintox - attributes(global) subroutine sum_zintox(u_x, u_z, nz) - implicit none +attributes(global) subroutine sum_zintox(u_x, u_z, nz) +implicit none - ! Arguments - real(dp), device, intent(inout), dimension(:, :, :) :: u_x - real(dp), device, intent(in), dimension(:, :, :) :: u_z - integer, value, intent(in) :: nz + ! Arguments +real(dp), device, intent(inout), dimension(:, :, :) :: u_x +real(dp), device, intent(in), dimension(:, :, :) :: u_z +integer, value, intent(in) :: nz - integer :: i, j, b_i, b_j, nx +integer :: i, j, b_i, b_j, nx - i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y - nx = gridDim%x +i = threadIdx%x; b_i = blockIdx%x; b_j = blockIdx%y +nx = gridDim%x - do j = 1, nz - u_x(i, b_i, j+(b_j-1)*nz) = u_x(i, b_i, j+(b_j-1)*nz) & - + u_z(i, j, b_i+(b_j-1)*nx) - end do +do j = 1, nz + u_x(i, b_i, j+(b_j-1)*nz) = u_x(i, b_i, j+(b_j-1)*nz) & + + u_z(i, j, b_i+(b_j-1)*nx) +end do - end subroutine sum_zintox +end subroutine sum_zintox - attributes(global) subroutine axpby(n, alpha, x, beta, y) - implicit none +attributes(global) subroutine axpby(n, alpha, x, beta, y) +implicit none - integer, value, intent(in) :: n - real(dp), value, intent(in) :: alpha, beta - real(dp), device, intent(in), dimension(:, :, :) :: x - real(dp), device, intent(inout), dimension(:, :, :) :: y +integer, value, intent(in) :: n +real(dp), value, intent(in) :: alpha, beta +real(dp), device, intent(in), dimension(:, :, :) :: x +real(dp), device, intent(inout), dimension(:, :, :) :: y - integer :: i, j, b +integer :: i, j, b - i = threadIdx%x - b = blockIdx%x +i = threadIdx%x +b = blockIdx%x - do j = 1, n - y(i, j, b) = alpha*x(i, j, b) + beta*y(i, j, b) - end do +do j = 1, n + y(i, j, b) = alpha*x(i, j, b) + beta*y(i, j, b) +end do - end subroutine axpby +end subroutine axpby - attributes(global) subroutine scalar_product(s, x, y, n) - implicit none +attributes(global) subroutine scalar_product(s, x, y, n) +implicit none - real(dp), device, intent(inout) :: s - real(dp), device, intent(in), dimension(:, :, :) :: x, y - integer, value, intent(in) :: n +real(dp), device, intent(inout) :: s +real(dp), device, intent(in), dimension(:, :, :) :: x, y +integer, value, intent(in) :: n - real(dp) :: s_pncl !! pencil sum - integer :: i, j, b, ierr +real(dp) :: s_pncl !! pencil sum +integer :: i, j, b, ierr - i = threadIdx%x - b = blockIdx%x +i = threadIdx%x +b = blockIdx%x - s_pncl = 0._dp - do j = 1, n - s_pncl = s_pncl + x(i, j, b)*y(i, j, b) - end do - ierr = atomicadd(s, s_pncl) +s_pncl = 0._dp +do j = 1, n + s_pncl = s_pncl + x(i, j, b)*y(i, j, b) +end do +ierr = atomicadd(s, s_pncl) - end subroutine scalar_product +end subroutine scalar_product - attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo) - implicit none +attributes(global) subroutine buffer_copy(u_send_s, u_send_e, u, n, n_halo) +implicit none - real(dp), device, intent(inout), dimension(:, :, :) :: u_send_s, u_send_e - real(dp), device, intent(in), dimension(:, :, :) :: u - integer, value, intent(in) :: n, n_halo +real(dp), device, intent(inout), dimension(:, :, :) :: u_send_s, u_send_e +real(dp), device, intent(in), dimension(:, :, :) :: u +integer, value, intent(in) :: n, n_halo - integer :: i, j, b +integer :: i, j, b - i = threadIdx%x - b = blockIdx%x +i = threadIdx%x +b = blockIdx%x - do j = 1, n_halo - u_send_s(i, j, b) = u(i, j, b) - u_send_e(i, j, b) = u(i, n - n_halo + j, b) - end do +do j = 1, n_halo + u_send_s(i, j, b) = u(i, j, b) + u_send_e(i, j, b) = u(i, n - n_halo + j, b) +end do - end subroutine buffer_copy +end subroutine buffer_copy end module m_cuda_kernels_reorder diff --git a/src/cuda/poisson_fft.f90 b/src/cuda/poisson_fft.f90 index 3df787a2..eabcf17e 100644 --- a/src/cuda/poisson_fft.f90 +++ b/src/cuda/poisson_fft.f90 @@ -1,21 +1,21 @@ -module m_cuda_poisson_fft - use cudafor - use cufft + module m_cuda_poisson_fft + use cudafor + use cufft - use m_allocator, only: field_t - use m_common, only: dp - use m_poisson_fft, only: poisson_fft_t - use m_tdsops, only: dirps_t + use m_allocator, only: field_t + use m_common, only: dp + use m_poisson_fft, only: poisson_fft_t + use m_tdsops, only: dirps_t - use m_cuda_allocator, only: cuda_field_t - use m_cuda_common, only: SZ - use m_cuda_complex, only: reorder_cmplx_x2y_T, reorder_cmplx_y2x_T, & - reorder_cmplx_y2z_T, reorder_cmplx_z2y_T, & - process_spectral_div_u + use m_cuda_allocator, only: cuda_field_t + use m_cuda_common, only: SZ + use m_cuda_complex, only: reorder_cmplx_x2y_T, reorder_cmplx_y2x_T, & + reorder_cmplx_y2z_T, reorder_cmplx_z2y_T, & + process_spectral_div_u - implicit none + implicit none - type, extends(poisson_fft_t) :: cuda_poisson_fft_t + type, extends(poisson_fft_t) :: cuda_poisson_fft_t !! FFT based Poisson solver !! It can only handle 1D decompositions along z direction. @@ -25,26 +25,26 @@ module m_cuda_poisson_fft complex(dp), device, allocatable, dimension(:, :, :) :: waves_dev real(dp), device, allocatable, dimension(:) :: ax_dev, bx_dev, & - ay_dev, by_dev, az_dev, bz_dev + ay_dev, by_dev, az_dev, bz_dev real(dp), device, allocatable, dimension(:, :, :) :: f_tmp integer :: planD2Zz, planZ2Dz, planZ2Zx, planZ2Zy - contains + contains procedure :: fft_forward => fft_forward_cuda procedure :: fft_backward => fft_backward_cuda procedure :: fft_postprocess => fft_postprocess_cuda - end type cuda_poisson_fft_t + end type cuda_poisson_fft_t - interface cuda_poisson_fft_t + interface cuda_poisson_fft_t module procedure init - end interface cuda_poisson_fft_t + end interface cuda_poisson_fft_t - private :: init + private :: init -contains + contains - function init(xdirps, ydirps, zdirps) result(poisson_fft) + function init(xdirps, ydirps, zdirps) result(poisson_fft) implicit none class(dirps_t), intent(in) :: xdirps, ydirps, zdirps @@ -82,31 +82,31 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) ! set cufft plans ierrfft = cufftCreate(poisson_fft%planD2Zz) ierrfft = cufftMakePlanMany(poisson_fft%planD2Zz, 1, nz, & - nz, 1, nz, nz/2+1, 1, nz/2+1, & - CUFFT_D2Z, nx*ny, worksize) + nz, 1, nz, nz/2+1, 1, nz/2+1, & + CUFFT_D2Z, nx*ny, worksize) ierrfft = cufftSetWorkArea(poisson_fft%planD2Zz, poisson_fft%c_x_dev) ierrfft = cufftCreate(poisson_fft%planZ2Dz) ierrfft = cufftMakePlanMany(poisson_fft%planZ2Dz, 1, nz, & - nz/2+1, 1, nz/2+1, nz, 1, nz, & - CUFFT_Z2D, nx*ny, worksize) + nz/2+1, 1, nz/2+1, nz, 1, nz, & + CUFFT_Z2D, nx*ny, worksize) ierrfft = cufftSetWorkArea(poisson_fft%planZ2Dz, poisson_fft%c_x_dev) ierrfft = cufftCreate(poisson_fft%planZ2Zy) ierrfft = cufftMakePlanMany(poisson_fft%planZ2Zy, 1, ny, & - ny, 1, ny, ny, 1, ny, & - CUFFT_Z2Z, nx*(nz/2 + 1), worksize) + ny, 1, ny, ny, 1, ny, & + CUFFT_Z2Z, nx*(nz/2 + 1), worksize) ierrfft = cufftSetWorkArea(poisson_fft%planZ2Zy, poisson_fft%c_x_dev) ierrfft = cufftCreate(poisson_fft%planZ2Zx) ierrfft = cufftMakePlanMany(poisson_fft%planZ2Zx, 1, nx, & - nx, 1, nx, nx, 1, nx, & - CUFFT_Z2Z, ny*(nz/2 + 1), worksize) + nx, 1, nx, nx, 1, nx, & + CUFFT_Z2Z, ny*(nz/2 + 1), worksize) ierrfft = cufftSetWorkArea(poisson_fft%planZ2Zx, poisson_fft%c_y_dev) - end function init + end function init - subroutine fft_forward_cuda(self, f) + subroutine fft_forward_cuda(self, f) implicit none class(cuda_poisson_fft_t) :: self @@ -114,7 +114,7 @@ subroutine fft_forward_cuda(self, f) real(dp), device, pointer, dimension(:, :, :) :: f_dev complex(dp), device, dimension(:, :, :), pointer :: c_x_ptr, c_y_ptr, & - c_z_ptr + c_z_ptr type(dim3) :: blocks, threads integer :: ierrfft @@ -136,11 +136,11 @@ subroutine fft_forward_cuda(self, f) c_z_ptr(1:self%nz/2 + 1, 1:SZ, 1:self%nx*self%ny/SZ) => self%c_z_dev call reorder_cmplx_z2y_T<<>>(c_y_ptr, c_z_ptr, & - self%nx, self%nz/2 + 1) + self%nx, self%nz/2 + 1) ! In-place forward FFT in y ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, & - CUFFT_FORWARD) + CUFFT_FORWARD) ! Reorder from y to x blocks = dim3(self%nx/SZ, self%ny/SZ, self%nz/2 + 1) @@ -149,15 +149,15 @@ subroutine fft_forward_cuda(self, f) c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev call reorder_cmplx_y2x_T<<>>(c_x_ptr, c_y_ptr, & - self%nz/2 + 1) + self%nz/2 + 1) ! In-place forward FFT in x ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, & - CUFFT_FORWARD) + CUFFT_FORWARD) - end subroutine fft_forward_cuda + end subroutine fft_forward_cuda - subroutine fft_backward_cuda(self, f) + subroutine fft_backward_cuda(self, f) implicit none class(cuda_poisson_fft_t) :: self @@ -165,7 +165,7 @@ subroutine fft_backward_cuda(self, f) real(dp), device, pointer, dimension(:, :, :) :: f_dev complex(dp), device, dimension(:, :, :), pointer :: c_x_ptr, c_y_ptr, & - c_z_ptr + c_z_ptr type(dim3) :: blocks, threads integer :: ierrfft @@ -174,7 +174,7 @@ subroutine fft_backward_cuda(self, f) ! In-place backward FFT in x ierrfft = cufftExecZ2Z(self%planZ2Zx, self%c_x_dev, self%c_x_dev, & - CUFFT_INVERSE) + CUFFT_INVERSE) ! Reorder from x to y blocks = dim3(self%nx/SZ, self%ny/SZ, self%nz/2 + 1) @@ -183,11 +183,11 @@ subroutine fft_backward_cuda(self, f) c_y_ptr(1:self%ny, 1:SZ, 1:(self%nx*(self%nz/2 + 1))/SZ) => self%c_y_dev call reorder_cmplx_x2y_T<<>>(c_y_ptr, c_x_ptr, & - self%nz/2 + 1) + self%nz/2 + 1) ! In-place backward FFT in y ierrfft = cufftExecZ2Z(self%planZ2Zy, self%c_y_dev, self%c_y_dev, & - CUFFT_INVERSE) + CUFFT_INVERSE) ! Reorder from y to z blocks = dim3(self%nz/2/SZ + 1, self%ny/SZ, self%nx) @@ -196,7 +196,7 @@ subroutine fft_backward_cuda(self, f) c_z_ptr(1:self%nz/2 + 1, 1:SZ, 1:self%nx*self%ny/SZ) => self%c_z_dev call reorder_cmplx_y2z_T<<>>(c_z_ptr, c_y_ptr, & - self%nx, self%nz/2 + 1) + self%nx, self%nz/2 + 1) ! Backward FFT transform in z from complex to real ierrfft = cufftExecZ2D(self%planZ2Dz, self%c_z_dev, self%f_tmp) @@ -206,9 +206,9 @@ subroutine fft_backward_cuda(self, f) threads = dim3(SZ, SZ, 1) call reshapeDSB<<>>(f_dev, self%f_tmp) - end subroutine fft_backward_cuda + end subroutine fft_backward_cuda - subroutine fft_postprocess_cuda(self) + subroutine fft_postprocess_cuda(self) implicit none class(cuda_poisson_fft_t) :: self @@ -229,16 +229,16 @@ subroutine fft_postprocess_cuda(self) blocks = dim3((self%ny*(self%nz/2 + 1))/SZ, 1, 1) threads = dim3(SZ, 1, 1) call process_spectral_div_u<<>>( & - c_dev, self%waves_dev, self%nx, self%ny, self%nz, & - self%ax_dev, self%bx_dev, self%ay_dev, self%by_dev, & - self%az_dev, self%bz_dev & - ) + c_dev, self%waves_dev, self%nx, self%ny, self%nz, & + self%ax_dev, self%bx_dev, self%ay_dev, self%by_dev, & + self%az_dev, self%bz_dev & + ) ! Reshape from our specialist data structure to cartesian-like blocks = dim3(self%nx/SZ, (self%ny*(self%nz/2 + 1))/SZ, 1) threads = dim3(SZ, SZ, 1) call reshapeCDSF<<>>(c_x_ptr, c_dev) - end subroutine fft_postprocess_cuda + end subroutine fft_postprocess_cuda -end module m_cuda_poisson_fft + end module m_cuda_poisson_fft diff --git a/src/cuda/sendrecv.f90 b/src/cuda/sendrecv.f90 index 25b5e9ce..f0eaae46 100644 --- a/src/cuda/sendrecv.f90 +++ b/src/cuda/sendrecv.f90 @@ -1,15 +1,15 @@ -module m_cuda_sendrecv - use cudafor - use mpi + module m_cuda_sendrecv + use cudafor + use mpi - use m_common, only: dp + use m_common, only: dp - implicit none + implicit none -contains + contains - subroutine sendrecv_fields(f_recv_s, f_recv_e, f_send_s, f_send_e, & - n_data, nproc, prev, next) + subroutine sendrecv_fields(f_recv_s, f_recv_e, f_send_s, f_send_e, & + n_data, nproc, prev, next) implicit none real(dp), device, dimension(:, :, :), intent(out) :: f_recv_s, f_recv_e @@ -19,76 +19,76 @@ subroutine sendrecv_fields(f_recv_s, f_recv_e, f_send_s, f_send_e, & integer :: req(4), err(4), ierr, tag = 1234 if (nproc == 1) then - f_recv_s = f_send_e - f_recv_e = f_send_s + f_recv_s = f_send_e + f_recv_e = f_send_s else - call MPI_Isend(f_send_s, n_data, MPI_DOUBLE_PRECISION, & - prev, tag, MPI_COMM_WORLD, req(1), err(1)) - call MPI_Irecv(f_recv_e, n_data, MPI_DOUBLE_PRECISION, & - next, tag, MPI_COMM_WORLD, req(2), err(2)) - call MPI_Isend(f_send_e, n_data, MPI_DOUBLE_PRECISION, & - next, tag, MPI_COMM_WORLD, req(3), err(3)) - call MPI_Irecv(f_recv_s, n_data, MPI_DOUBLE_PRECISION, & - prev, tag, MPI_COMM_WORLD, req(4), err(4)) - - call MPI_Waitall(4, req, MPI_STATUSES_IGNORE, ierr) + call MPI_Isend(f_send_s, n_data, MPI_DOUBLE_PRECISION, & + prev, tag, MPI_COMM_WORLD, req(1), err(1)) + call MPI_Irecv(f_recv_e, n_data, MPI_DOUBLE_PRECISION, & + next, tag, MPI_COMM_WORLD, req(2), err(2)) + call MPI_Isend(f_send_e, n_data, MPI_DOUBLE_PRECISION, & + next, tag, MPI_COMM_WORLD, req(3), err(3)) + call MPI_Irecv(f_recv_s, n_data, MPI_DOUBLE_PRECISION, & + prev, tag, MPI_COMM_WORLD, req(4), err(4)) + + call MPI_Waitall(4, req, MPI_STATUSES_IGNORE, ierr) end if - end subroutine sendrecv_fields + end subroutine sendrecv_fields - subroutine sendrecv_3fields( & + subroutine sendrecv_3fields( & f1_recv_s, f1_recv_e, f2_recv_s, f2_recv_e, f3_recv_s, f3_recv_e, & f1_send_s, f1_send_e, f2_send_s, f2_send_e, f3_send_s, f3_send_e, & n_data, nproc, prev, next & - ) + ) implicit none real(dp), device, dimension(:, :, :), intent(out) :: & - f1_recv_s, f1_recv_e, f2_recv_s, f2_recv_e, f3_recv_s, f3_recv_e + f1_recv_s, f1_recv_e, f2_recv_s, f2_recv_e, f3_recv_s, f3_recv_e real(dp), device, dimension(:, :, :), intent(in) :: & - f1_send_s, f1_send_e, f2_send_s, f2_send_e, f3_send_s, f3_send_e + f1_send_s, f1_send_e, f2_send_s, f2_send_e, f3_send_s, f3_send_e integer, intent(in) :: n_data, nproc, prev, next integer :: req(12), err(12), ierr, tag = 1234 if (nproc == 1) then - f1_recv_s = f1_send_e - f1_recv_e = f1_send_s - f2_recv_s = f2_send_e - f2_recv_e = f2_send_s - f3_recv_s = f3_send_e - f3_recv_e = f3_send_s + f1_recv_s = f1_send_e + f1_recv_e = f1_send_s + f2_recv_s = f2_send_e + f2_recv_e = f2_send_s + f3_recv_s = f3_send_e + f3_recv_e = f3_send_s else - call MPI_Isend(f1_send_s, n_data, MPI_DOUBLE_PRECISION, & - prev, tag, MPI_COMM_WORLD, req(1), err(1)) - call MPI_Irecv(f1_recv_e, n_data, MPI_DOUBLE_PRECISION, & - next, tag, MPI_COMM_WORLD, req(2), err(2)) - call MPI_Isend(f1_send_e, n_data, MPI_DOUBLE_PRECISION, & - next, tag, MPI_COMM_WORLD, req(3), err(3)) - call MPI_Irecv(f1_recv_s, n_data, MPI_DOUBLE_PRECISION, & - prev, tag, MPI_COMM_WORLD, req(4), err(4)) - - call MPI_Isend(f2_send_s, n_data, MPI_DOUBLE_PRECISION, & - prev, tag, MPI_COMM_WORLD, req(5), err(5)) - call MPI_Irecv(f2_recv_e, n_data, MPI_DOUBLE_PRECISION, & - next, tag, MPI_COMM_WORLD, req(6), err(6)) - call MPI_Isend(f2_send_e, n_data, MPI_DOUBLE_PRECISION, & - next, tag, MPI_COMM_WORLD, req(7), err(7)) - call MPI_Irecv(f2_recv_s, n_data, MPI_DOUBLE_PRECISION, & - prev, tag, MPI_COMM_WORLD, req(8), err(8)) - - call MPI_Isend(f3_send_s, n_data, MPI_DOUBLE_PRECISION, & - prev, tag, MPI_COMM_WORLD, req(9), err(9)) - call MPI_Irecv(f3_recv_e, n_data, MPI_DOUBLE_PRECISION, & - next, tag, MPI_COMM_WORLD, req(10), err(10)) - call MPI_Isend(f3_send_e, n_data, MPI_DOUBLE_PRECISION, & - next, tag, MPI_COMM_WORLD, req(11), err(11)) - call MPI_Irecv(f3_recv_s, n_data, MPI_DOUBLE_PRECISION, & - prev, tag, MPI_COMM_WORLD, req(12), err(12)) - - call MPI_Waitall(12, req, MPI_STATUSES_IGNORE, ierr) + call MPI_Isend(f1_send_s, n_data, MPI_DOUBLE_PRECISION, & + prev, tag, MPI_COMM_WORLD, req(1), err(1)) + call MPI_Irecv(f1_recv_e, n_data, MPI_DOUBLE_PRECISION, & + next, tag, MPI_COMM_WORLD, req(2), err(2)) + call MPI_Isend(f1_send_e, n_data, MPI_DOUBLE_PRECISION, & + next, tag, MPI_COMM_WORLD, req(3), err(3)) + call MPI_Irecv(f1_recv_s, n_data, MPI_DOUBLE_PRECISION, & + prev, tag, MPI_COMM_WORLD, req(4), err(4)) + + call MPI_Isend(f2_send_s, n_data, MPI_DOUBLE_PRECISION, & + prev, tag, MPI_COMM_WORLD, req(5), err(5)) + call MPI_Irecv(f2_recv_e, n_data, MPI_DOUBLE_PRECISION, & + next, tag, MPI_COMM_WORLD, req(6), err(6)) + call MPI_Isend(f2_send_e, n_data, MPI_DOUBLE_PRECISION, & + next, tag, MPI_COMM_WORLD, req(7), err(7)) + call MPI_Irecv(f2_recv_s, n_data, MPI_DOUBLE_PRECISION, & + prev, tag, MPI_COMM_WORLD, req(8), err(8)) + + call MPI_Isend(f3_send_s, n_data, MPI_DOUBLE_PRECISION, & + prev, tag, MPI_COMM_WORLD, req(9), err(9)) + call MPI_Irecv(f3_recv_e, n_data, MPI_DOUBLE_PRECISION, & + next, tag, MPI_COMM_WORLD, req(10), err(10)) + call MPI_Isend(f3_send_e, n_data, MPI_DOUBLE_PRECISION, & + next, tag, MPI_COMM_WORLD, req(11), err(11)) + call MPI_Irecv(f3_recv_s, n_data, MPI_DOUBLE_PRECISION, & + prev, tag, MPI_COMM_WORLD, req(12), err(12)) + + call MPI_Waitall(12, req, MPI_STATUSES_IGNORE, ierr) end if - end subroutine sendrecv_3fields + end subroutine sendrecv_3fields -end module m_cuda_sendrecv + end module m_cuda_sendrecv diff --git a/src/cuda/tdsops.f90 b/src/cuda/tdsops.f90 index aaee541a..7310187c 100644 --- a/src/cuda/tdsops.f90 +++ b/src/cuda/tdsops.f90 @@ -1,32 +1,32 @@ -module m_cuda_tdsops - use iso_fortran_env, only: stderr => error_unit + module m_cuda_tdsops + use iso_fortran_env, only: stderr => error_unit - use m_common, only: dp - use m_tdsops, only: tdsops_t, tdsops_init + use m_common, only: dp + use m_tdsops, only: tdsops_t, tdsops_init - implicit none + implicit none - type, extends(tdsops_t) :: cuda_tdsops_t + type, extends(tdsops_t) :: cuda_tdsops_t !! CUDA extension of the Tridiagonal Solver Operators class. !! !! Regular tdsops_t class is initiated and the coefficient arrays are !! copied into device arrays so that cuda kernels can use them. real(dp), device, allocatable :: dist_fw_dev(:), dist_bw_dev(:), & - dist_sa_dev(:), dist_sc_dev(:), & - dist_af_dev(:) + dist_sa_dev(:), dist_sc_dev(:), & + dist_af_dev(:) real(dp), device, allocatable :: coeffs_dev(:), & - coeffs_s_dev(:, :), coeffs_e_dev(:, :) - contains - end type cuda_tdsops_t + coeffs_s_dev(:, :), coeffs_e_dev(:, :) + contains + end type cuda_tdsops_t - interface cuda_tdsops_t + interface cuda_tdsops_t module procedure cuda_tdsops_init - end interface cuda_tdsops_t + end interface cuda_tdsops_t -contains + contains - function cuda_tdsops_init(n, delta, operation, scheme, n_halo, from_to, & - bc_start, bc_end, sym, c_nu, nu0_nu) & + function cuda_tdsops_init(n, delta, operation, scheme, n_halo, from_to, & + bc_start, bc_end, sym, c_nu, nu0_nu) & result(tdsops) !! Constructor function for the cuda_tdsops_t class. !! See tdsops_t for details. @@ -45,8 +45,8 @@ function cuda_tdsops_init(n, delta, operation, scheme, n_halo, from_to, & integer :: n_stencil tdsops%tdsops_t = tdsops_init(n, delta, operation, scheme, n_halo, & - from_to, bc_start, bc_end, sym, & - c_nu, nu0_nu) + from_to, bc_start, bc_end, sym, & + c_nu, nu0_nu) n_stencil = 2*tdsops%n_halo + 1 @@ -67,7 +67,7 @@ function cuda_tdsops_init(n, delta, operation, scheme, n_halo, from_to, & tdsops%coeffs_s_dev(:, :) = tdsops%coeffs_s(:, :) tdsops%coeffs_e_dev(:, :) = tdsops%coeffs_e(:, :) - end function cuda_tdsops_init + end function cuda_tdsops_init -end module m_cuda_tdsops + end module m_cuda_tdsops diff --git a/src/omp/backend.f90 b/src/omp/backend.f90 index 50ee4d75..d52f51c9 100644 --- a/src/omp/backend.f90 +++ b/src/omp/backend.f90 @@ -1,30 +1,30 @@ -module m_omp_backend - use m_allocator, only: allocator_t, field_t - use m_base_backend, only: base_backend_t - use m_ordering, only: get_index_reordering - use m_common, only: dp, globs_t, & - RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y - use m_tdsops, only: dirps_t, tdsops_t - use m_omp_exec_dist, only: exec_dist_tds_compact, exec_dist_transeq_compact - use m_omp_sendrecv, only: sendrecv_fields + module m_omp_backend + use m_allocator, only: allocator_t, field_t + use m_base_backend, only: base_backend_t + use m_ordering, only: get_index_reordering + use m_common, only: dp, globs_t, & + RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y + use m_tdsops, only: dirps_t, tdsops_t + use m_omp_exec_dist, only: exec_dist_tds_compact, exec_dist_transeq_compact + use m_omp_sendrecv, only: sendrecv_fields - use m_omp_common, only: SZ - use m_omp_poisson_fft, only: omp_poisson_fft_t + use m_omp_common, only: SZ + use m_omp_poisson_fft, only: omp_poisson_fft_t - implicit none + implicit none - private :: transeq_halo_exchange, transeq_dist_component + private :: transeq_halo_exchange, transeq_dist_component - type, extends(base_backend_t) :: omp_backend_t + type, extends(base_backend_t) :: omp_backend_t !character(len=*), parameter :: name = 'omp' integer :: MPI_FP_PREC = dp real(dp), allocatable, dimension(:, :, :) :: & - u_recv_s, u_recv_e, u_send_s, u_send_e, & - v_recv_s, v_recv_e, v_send_s, v_send_e, & - w_recv_s, w_recv_e, w_send_s, w_send_e, & - du_send_s, du_send_e, du_recv_s, du_recv_e, & - dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & - d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e + u_recv_s, u_recv_e, u_send_s, u_send_e, & + v_recv_s, v_recv_e, v_send_s, v_send_e, & + w_recv_s, w_recv_e, w_send_s, w_send_e, & + du_send_s, du_send_e, du_recv_s, du_recv_e, & + dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & + d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e contains procedure :: alloc_tdsops => alloc_omp_tdsops procedure :: transeq_x => transeq_x_omp @@ -40,15 +40,15 @@ module m_omp_backend procedure :: copy_f_to_data => copy_f_to_data_omp procedure :: init_poisson_fft => init_omp_poisson_fft procedure :: transeq_omp_dist - end type omp_backend_t + end type omp_backend_t - interface omp_backend_t + interface omp_backend_t module procedure init - end interface omp_backend_t + end interface omp_backend_t - contains + contains - function init(globs, allocator) result(backend) + function init(globs, allocator) result(backend) implicit none class(globs_t) :: globs @@ -58,9 +58,9 @@ function init(globs, allocator) result(backend) integer :: n_halo, n_block select type(allocator) - type is (allocator_t) - ! class level access to the allocator - backend%allocator => allocator + type is (allocator_t) + ! class level access to the allocator + backend%allocator => allocator end select n_halo = 4 @@ -92,9 +92,9 @@ function init(globs, allocator) result(backend) allocate(backend%d2u_recv_s(SZ, 1, n_block)) allocate(backend%d2u_recv_e(SZ, 1, n_block)) - end function init + end function init - subroutine alloc_omp_tdsops( & + subroutine alloc_omp_tdsops( & self, tdsops, n, dx, operation, scheme, & n_halo, from_to, bc_start, bc_end, sym, c_nu, nu0_nu & ) @@ -113,14 +113,14 @@ subroutine alloc_omp_tdsops( & allocate(tdsops_t :: tdsops) select type (tdsops) - type is (tdsops_t) - tdsops = tdsops_t(n, dx, operation, scheme, n_halo, from_to, & - bc_start, bc_end, sym, c_nu, nu0_nu) + type is (tdsops_t) + tdsops = tdsops_t(n, dx, operation, scheme, n_halo, from_to, & + bc_start, bc_end, sym, c_nu, nu0_nu) end select - end subroutine alloc_omp_tdsops + end subroutine alloc_omp_tdsops - subroutine transeq_x_omp(self, du, dv, dw, u, v, w, dirps) + subroutine transeq_x_omp(self, du, dv, dw, u, v, w, dirps) implicit none class(omp_backend_t) :: self @@ -130,9 +130,9 @@ subroutine transeq_x_omp(self, du, dv, dw, u, v, w, dirps) call self%transeq_omp_dist(du, dv, dw, u, v, w, dirps) - end subroutine transeq_x_omp + end subroutine transeq_x_omp - subroutine transeq_y_omp(self, du, dv, dw, u, v, w, dirps) + subroutine transeq_y_omp(self, du, dv, dw, u, v, w, dirps) implicit none class(omp_backend_t) :: self @@ -143,9 +143,9 @@ subroutine transeq_y_omp(self, du, dv, dw, u, v, w, dirps) ! u, v, w is reordered so that we pass v, u, w call self%transeq_omp_dist(dv, du, dw, v, u, w, dirps) - end subroutine transeq_y_omp + end subroutine transeq_y_omp - subroutine transeq_z_omp(self, du, dv, dw, u, v, w, dirps) + subroutine transeq_z_omp(self, du, dv, dw, u, v, w, dirps) implicit none class(omp_backend_t) :: self @@ -156,9 +156,9 @@ subroutine transeq_z_omp(self, du, dv, dw, u, v, w, dirps) ! u, v, w is reordered so that we pass w, u, v call self%transeq_omp_dist(dw, du, dv, w, u, v, dirps) - end subroutine transeq_z_omp + end subroutine transeq_z_omp - subroutine transeq_omp_dist(self, du, dv, dw, u, v, w, dirps) + subroutine transeq_omp_dist(self, du, dv, dw, u, v, w, dirps) implicit none class(omp_backend_t) :: self @@ -169,25 +169,25 @@ subroutine transeq_omp_dist(self, du, dv, dw, u, v, w, dirps) call transeq_halo_exchange(self, u, v, w, dirps) call transeq_dist_component(self, du, u, u, & - self%u_recv_s, self%u_recv_e, & - self%u_recv_s, self%u_recv_e, & - dirps%der1st, dirps%der1st_sym, & - dirps%der2nd, dirps) + self%u_recv_s, self%u_recv_e, & + self%u_recv_s, self%u_recv_e, & + dirps%der1st, dirps%der1st_sym, & + dirps%der2nd, dirps) call transeq_dist_component(self, dv, v, u, & - self%v_recv_s, self%v_recv_e, & - self%u_recv_s, self%u_recv_e, & - dirps%der1st_sym, dirps%der1st, & - dirps%der2nd_sym, dirps) + self%v_recv_s, self%v_recv_e, & + self%u_recv_s, self%u_recv_e, & + dirps%der1st_sym, dirps%der1st, & + dirps%der2nd_sym, dirps) call transeq_dist_component(self, dw, w, u, & - self%w_recv_s, self%w_recv_e, & - self%u_recv_s, self%u_recv_e, & - dirps%der1st_sym, dirps%der1st, & - dirps%der2nd_sym, dirps) + self%w_recv_s, self%w_recv_e, & + self%u_recv_s, self%u_recv_e, & + dirps%der1st_sym, dirps%der1st, & + dirps%der2nd_sym, dirps) - end subroutine transeq_omp_dist + end subroutine transeq_omp_dist - subroutine transeq_halo_exchange(self, u, v, w, dirps) + subroutine transeq_halo_exchange(self, u, v, w, dirps) class(omp_backend_t) :: self class(field_t), intent(in) :: u, v, w type(dirps_t), intent(in) :: dirps @@ -201,18 +201,18 @@ subroutine transeq_halo_exchange(self, u, v, w, dirps) call copy_into_buffers(self%w_send_s, self%w_send_e, w%data, dirps%n, dirps%n_blocks) call sendrecv_fields(self%u_recv_s, self%u_recv_e, self%u_send_s, self%u_send_e, & - SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) + SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) call sendrecv_fields(self%v_recv_s, self%v_recv_e, self%v_send_s, self%v_send_e, & - SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) + SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) call sendrecv_fields(self%w_recv_s, self%w_recv_e, self%w_send_s, self%w_send_e, & - SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) + SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) - end subroutine transeq_halo_exchange + end subroutine transeq_halo_exchange - subroutine transeq_dist_component(self, rhs, u, conv, & - u_recv_s, u_recv_e, & - conv_recv_s, conv_recv_e, & - tdsops_du, tdsops_dud, tdsops_d2u, dirps) + subroutine transeq_dist_component(self, rhs, u, conv, & + u_recv_s, u_recv_e, & + conv_recv_s, conv_recv_e, & + tdsops_du, tdsops_dud, tdsops_d2u, dirps) !! Computes RHS_x^u following: !! !! rhs_x^u = -0.5*(conv*du/dx + d(u*conv)/dx) + nu*d2u/dx2 @@ -220,7 +220,7 @@ subroutine transeq_dist_component(self, rhs, u, conv, & class(field_t), intent(inout) :: rhs class(field_t), intent(in) :: u, conv real(dp), dimension(:, :, :), intent(in) :: u_recv_s, u_recv_e, & - conv_recv_s, conv_recv_e + conv_recv_s, conv_recv_e class(tdsops_t), intent(in) :: tdsops_du class(tdsops_t), intent(in) :: tdsops_dud class(tdsops_t), intent(in) :: tdsops_d2u @@ -232,22 +232,22 @@ subroutine transeq_dist_component(self, rhs, u, conv, & d2u => self%allocator%get_block(dirps%dir) call exec_dist_transeq_compact(& - rhs%data, du%data, dud%data, d2u%data, & - self%du_send_s, self%du_send_e, self%du_recv_s, self%du_recv_e, & - self%dud_send_s, self%dud_send_e, self%dud_recv_s, self%dud_recv_e, & - self%d2u_send_s, self%d2u_send_e, self%d2u_recv_s, self%d2u_recv_e, & - u%data, u_recv_s, u_recv_e, & - conv%data, conv_recv_s, conv_recv_e, & - tdsops_du, tdsops_dud, tdsops_d2u, self%nu, & - dirps%nproc, dirps%pprev, dirps%pnext, dirps%n_blocks) + rhs%data, du%data, dud%data, d2u%data, & + self%du_send_s, self%du_send_e, self%du_recv_s, self%du_recv_e, & + self%dud_send_s, self%dud_send_e, self%dud_recv_s, self%dud_recv_e, & + self%d2u_send_s, self%d2u_send_e, self%d2u_recv_s, self%d2u_recv_e, & + u%data, u_recv_s, u_recv_e, & + conv%data, conv_recv_s, conv_recv_e, & + tdsops_du, tdsops_dud, tdsops_d2u, self%nu, & + dirps%nproc, dirps%pprev, dirps%pnext, dirps%n_blocks) call self%allocator%release_block(du) call self%allocator%release_block(dud) call self%allocator%release_block(d2u) - end subroutine transeq_dist_component + end subroutine transeq_dist_component - subroutine tds_solve_omp(self, du, u, dirps, tdsops) + subroutine tds_solve_omp(self, du, u, dirps, tdsops) implicit none class(omp_backend_t) :: self @@ -258,14 +258,14 @@ subroutine tds_solve_omp(self, du, u, dirps, tdsops) ! Check if direction matches for both in/out fields and dirps if (dirps%dir /= du%dir .or. u%dir /= du%dir) then - error stop 'DIR mismatch between fields and dirps in tds_solve.' + error stop 'DIR mismatch between fields and dirps in tds_solve.' end if call tds_solve_dist(self, du, u, dirps, tdsops) - end subroutine tds_solve_omp + end subroutine tds_solve_omp - subroutine tds_solve_dist(self, du, u, dirps, tdsops) + subroutine tds_solve_dist(self, du, u, dirps, tdsops) implicit none class(omp_backend_t) :: self @@ -281,17 +281,17 @@ subroutine tds_solve_dist(self, du, u, dirps, tdsops) ! halo exchange call sendrecv_fields(self%u_recv_s, self%u_recv_e, self%u_send_s, self%u_send_e, & - SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) + SZ*n_halo*dirps%n_blocks, dirps%nproc, dirps%pprev, dirps%pnext) call exec_dist_tds_compact( & - du%data, u%data, self%u_recv_s, self%u_recv_e, self%du_send_s, self%du_send_e, & - self%du_recv_s, self%du_recv_e, & - tdsops, dirps%nproc, dirps%pprev, dirps%pnext, dirps%n_blocks) + du%data, u%data, self%u_recv_s, self%u_recv_e, self%du_send_s, self%du_send_e, & + self%du_recv_s, self%du_recv_e, & + tdsops, dirps%nproc, dirps%pprev, dirps%pnext, dirps%n_blocks) - end subroutine tds_solve_dist + end subroutine tds_solve_dist - subroutine reorder_omp(self, u_, u, direction) + subroutine reorder_omp(self, u_, u, direction) implicit none class(omp_backend_t) :: self @@ -303,63 +303,63 @@ subroutine reorder_omp(self, u_, u, direction) integer :: out_i, out_j, out_k select case (direction) - case (RDR_X2Y) - ndir_loc = self%xdirps%n - ndir_groups = self%xdirps%n_blocks - case (RDR_X2Z) - ndir_loc = self%xdirps%n - ndir_groups = self%xdirps%n_blocks - case (RDR_Y2X) - ndir_loc = self%ydirps%n - ndir_groups = self%ydirps%n_blocks - case (RDR_Y2Z) - ndir_loc = self%ydirps%n - ndir_groups = self%ydirps%n_blocks - case (RDR_Z2X) - ndir_loc = self%zdirps%n - ndir_groups = self%zdirps%n_blocks - case (RDR_Z2Y) - ndir_loc = self%zdirps%n - ndir_groups = self%zdirps%n_blocks - case default - ndir_loc = 0 - ndir_groups = 0 - error stop 'unsuported reordering' + case (RDR_X2Y) + ndir_loc = self%xdirps%n + ndir_groups = self%xdirps%n_blocks + case (RDR_X2Z) + ndir_loc = self%xdirps%n + ndir_groups = self%xdirps%n_blocks + case (RDR_Y2X) + ndir_loc = self%ydirps%n + ndir_groups = self%ydirps%n_blocks + case (RDR_Y2Z) + ndir_loc = self%ydirps%n + ndir_groups = self%ydirps%n_blocks + case (RDR_Z2X) + ndir_loc = self%zdirps%n + ndir_groups = self%zdirps%n_blocks + case (RDR_Z2Y) + ndir_loc = self%zdirps%n + ndir_groups = self%zdirps%n_blocks + case default + ndir_loc = 0 + ndir_groups = 0 + error stop 'unsuported reordering' end select !$omp parallel do private(out_i, out_j, out_k) collapse(2) do k=1, ndir_groups - do j=1, ndir_loc - do i=1, SZ - call get_index_reordering(out_i, out_j, out_k, i, j, k, direction, & - SZ, self%xdirps%n, self%ydirps%n, self%zdirps%n) - u_%data(out_i, out_j, out_k) = u%data(i,j,k) - end do - end do + do j=1, ndir_loc + do i=1, SZ + call get_index_reordering(out_i, out_j, out_k, i, j, k, direction, & + SZ, self%xdirps%n, self%ydirps%n, self%zdirps%n) + u_%data(out_i, out_j, out_k) = u%data(i,j,k) + end do + end do end do !$omp end parallel do - end subroutine reorder_omp + end subroutine reorder_omp - subroutine sum_yintox_omp(self, u, u_) + subroutine sum_yintox_omp(self, u, u_) implicit none class(omp_backend_t) :: self class(field_t), intent(inout) :: u class(field_t), intent(in) :: u_ - end subroutine sum_yintox_omp + end subroutine sum_yintox_omp - subroutine sum_zintox_omp(self, u, u_) + subroutine sum_zintox_omp(self, u, u_) implicit none class(omp_backend_t) :: self class(field_t), intent(inout) :: u class(field_t), intent(in) :: u_ - end subroutine sum_zintox_omp + end subroutine sum_zintox_omp - subroutine vecadd_omp(self, a, x, b, y) + subroutine vecadd_omp(self, a, x, b, y) implicit none class(omp_backend_t) :: self @@ -368,9 +368,9 @@ subroutine vecadd_omp(self, a, x, b, y) real(dp), intent(in) :: b class(field_t), intent(inout) :: y - end subroutine vecadd_omp + end subroutine vecadd_omp - real(dp) function scalar_product_omp(self, x, y) result(s) + real(dp) function scalar_product_omp(self, x, y) result(s) implicit none class(omp_backend_t) :: self @@ -378,9 +378,9 @@ real(dp) function scalar_product_omp(self, x, y) result(s) s = 0._dp - end function scalar_product_omp + end function scalar_product_omp - subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks) + subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks) implicit none real(dp), dimension(:, :, :), intent(out) :: u_send_s, u_send_e @@ -392,36 +392,36 @@ subroutine copy_into_buffers(u_send_s, u_send_e, u, n, n_blocks) !$omp parallel do do k=1, n_blocks - do j=1, n_halo - !$omp simd - do i=1, SZ - u_send_s(i, j, k) = u(i, j, k) - u_send_e(i, j, k) = u(i, n - n_halo + j, k) - end do - !$omp end simd - end do + do j=1, n_halo + !$omp simd + do i=1, SZ + u_send_s(i, j, k) = u(i, j, k) + u_send_e(i, j, k) = u(i, n - n_halo + j, k) + end do + !$omp end simd + end do end do !$omp end parallel do - end subroutine copy_into_buffers + end subroutine copy_into_buffers - subroutine copy_data_to_f_omp(self, f, data) + subroutine copy_data_to_f_omp(self, f, data) class(omp_backend_t), intent(inout) :: self class(field_t), intent(inout) :: f real(dp), dimension(:, :, :), intent(in) :: data f%data = data - end subroutine copy_data_to_f_omp + end subroutine copy_data_to_f_omp - subroutine copy_f_to_data_omp(self, data, f) + subroutine copy_f_to_data_omp(self, data, f) class(omp_backend_t), intent(inout) :: self real(dp), dimension(:, :, :), intent(out) :: data class(field_t), intent(in) :: f data = f%data - end subroutine copy_f_to_data_omp + end subroutine copy_f_to_data_omp - subroutine init_omp_poisson_fft(self, xdirps, ydirps, zdirps) + subroutine init_omp_poisson_fft(self, xdirps, ydirps, zdirps) implicit none class(omp_backend_t) :: self @@ -430,11 +430,11 @@ subroutine init_omp_poisson_fft(self, xdirps, ydirps, zdirps) allocate(omp_poisson_fft_t :: self%poisson_fft) select type (poisson_fft => self%poisson_fft) - type is (omp_poisson_fft_t) - poisson_fft = omp_poisson_fft_t(xdirps, ydirps, zdirps) + type is (omp_poisson_fft_t) + poisson_fft = omp_poisson_fft_t(xdirps, ydirps, zdirps) end select - end subroutine init_omp_poisson_fft + end subroutine init_omp_poisson_fft -end module m_omp_backend + end module m_omp_backend diff --git a/src/omp/common.f90 b/src/omp/common.f90 index cb1d00b7..4744c237 100644 --- a/src/omp/common.f90 +++ b/src/omp/common.f90 @@ -1,6 +1,6 @@ -module m_omp_common - implicit none + module m_omp_common + implicit none - integer, parameter :: SZ=16 + integer, parameter :: SZ=16 -end module m_omp_common + end module m_omp_common diff --git a/src/omp/exec_dist.f90 b/src/omp/exec_dist.f90 index e713c08a..737daca9 100644 --- a/src/omp/exec_dist.f90 +++ b/src/omp/exec_dist.f90 @@ -1,17 +1,17 @@ -module m_omp_exec_dist - use mpi + module m_omp_exec_dist + use mpi - use m_common, only: dp - use m_omp_common, only: SZ - use m_omp_kernels_dist, only: der_univ_dist, der_univ_subs - use m_tdsops, only: tdsops_t - use m_omp_sendrecv, only: sendrecv_fields + use m_common, only: dp + use m_omp_common, only: SZ + use m_omp_kernels_dist, only: der_univ_dist, der_univ_subs + use m_tdsops, only: tdsops_t + use m_omp_sendrecv, only: sendrecv_fields - implicit none + implicit none -contains + contains - subroutine exec_dist_tds_compact( & + subroutine exec_dist_tds_compact( & du, u, u_recv_s, u_recv_e, du_send_s, du_send_e, du_recv_s, du_recv_e, & tdsops, nproc, pprev, pnext, n_block & ) @@ -25,7 +25,7 @@ subroutine exec_dist_tds_compact( & ! not because we actually need the data they store later where this ! subroutine is called. We absolutely don't care about the data they pass back real(dp), dimension(:, :, :), intent(out) :: & - du_send_s, du_send_e, du_recv_s, du_recv_e + du_send_s, du_send_e, du_recv_s, du_recv_e type(tdsops_t), intent(in) :: tdsops integer, intent(in) :: nproc, pprev, pnext @@ -38,31 +38,31 @@ subroutine exec_dist_tds_compact( & !$omp parallel do do k = 1, n_block - call der_univ_dist( & - du(:, :, k), du_send_s(:, :, k), du_send_e(:, :, k), u(:, :, k), & - u_recv_s(:, :, k), u_recv_e(:, :, k), & - tdsops%coeffs_s, tdsops%coeffs_e, tdsops%coeffs, tdsops%n, & - tdsops%dist_fw, tdsops%dist_bw, tdsops%dist_af & - ) + call der_univ_dist( & + du(:, :, k), du_send_s(:, :, k), du_send_e(:, :, k), u(:, :, k), & + u_recv_s(:, :, k), u_recv_e(:, :, k), & + tdsops%coeffs_s, tdsops%coeffs_e, tdsops%coeffs, tdsops%n, & + tdsops%dist_fw, tdsops%dist_bw, tdsops%dist_af & + ) end do !$omp end parallel do ! halo exchange for 2x2 systems call sendrecv_fields(du_recv_s, du_recv_e, du_send_s, du_send_e, & - n_data, nproc, pprev, pnext) + n_data, nproc, pprev, pnext) !$omp parallel do do k = 1, n_block - call der_univ_subs(du(:, :, k), & - du_recv_s(:, :, k), du_recv_e(:, :, k), & - tdsops%n, tdsops%dist_sa, tdsops%dist_sc) + call der_univ_subs(du(:, :, k), & + du_recv_s(:, :, k), du_recv_e(:, :, k), & + tdsops%n, tdsops%dist_sa, tdsops%dist_sc) end do !$omp end parallel do - end subroutine exec_dist_tds_compact + end subroutine exec_dist_tds_compact - subroutine exec_dist_transeq_compact(& + subroutine exec_dist_transeq_compact(& rhs, du, dud, d2u, & du_send_s, du_send_e, du_recv_s, du_recv_e, & dud_send_s, dud_send_e, dud_recv_s, dud_recv_e, & @@ -80,11 +80,11 @@ subroutine exec_dist_transeq_compact(& ! not because we actually need the data they store later where this ! subroutine is called. We absolutely don't care about the data they pass back real(dp), dimension(:, :, :), intent(out) :: & - du_send_s, du_send_e, du_recv_s, du_recv_e + du_send_s, du_send_e, du_recv_s, du_recv_e real(dp), dimension(:, :, :), intent(out) :: & - dud_send_s, dud_send_e, dud_recv_s, dud_recv_e + dud_send_s, dud_send_e, dud_recv_s, dud_recv_e real(dp), dimension(:, :, :), intent(out) :: & - d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e + d2u_send_s, d2u_send_e, d2u_recv_s, d2u_recv_e real(dp), dimension(:, :, :), intent(in) :: u, u_recv_s, u_recv_e real(dp), dimension(:, :, :), intent(in) :: v, v_recv_s, v_recv_e @@ -110,85 +110,85 @@ subroutine exec_dist_transeq_compact(& !$omp parallel do private(ud, ud_recv_e, ud_recv_s) do k = 1, n_block - call der_univ_dist( & - du(:, :, k), du_send_s(:, :, k), du_send_e(:, :, k), u(:, :, k), & - u_recv_s(:, :, k), u_recv_e(:, :, k), & - tdsops_du%coeffs_s, tdsops_du%coeffs_e, tdsops_du%coeffs, tdsops_du%n, & - tdsops_du%dist_fw, tdsops_du%dist_bw, tdsops_du%dist_af & - ) - - call der_univ_dist( & - d2u(:, :, k), d2u_send_s(:, :, k), d2u_send_e(:, :, k), u(:, :, k), & - u_recv_s(:, :, k), u_recv_e(:, :, k), & - tdsops_d2u%coeffs_s, tdsops_d2u%coeffs_e, tdsops_d2u%coeffs, tdsops_d2u%n, & - tdsops_d2u%dist_fw, tdsops_d2u%dist_bw, tdsops_d2u%dist_af & - ) - - ! Handle dud by locally generating u*v - do j = 1, n - !$omp simd - do i = 1, SZ - ud(i, j) = u(i, j, k) * v(i, j, k) - end do - !$omp end simd - end do - - do j = 1, n_halo - !$omp simd - do i = 1, SZ - ud_recv_s(i, j) = u_recv_s(i, j, k) * v_recv_s(i, j, k) - ud_recv_e(i, j) = u_recv_e(i, j, k) * v_recv_e(i, j, k) - end do - !$omp end simd - end do - - call der_univ_dist( & - dud(:, :, k), dud_send_s(:, :, k), dud_send_e(:, :, k), ud(:, :), & - ud_recv_s(:, :), ud_recv_e(:, :), & - tdsops_dud%coeffs_s, tdsops_dud%coeffs_e, tdsops_dud%coeffs, tdsops_dud%n, & - tdsops_dud%dist_fw, tdsops_dud%dist_bw, tdsops_dud%dist_af & - ) - + call der_univ_dist( & + du(:, :, k), du_send_s(:, :, k), du_send_e(:, :, k), u(:, :, k), & + u_recv_s(:, :, k), u_recv_e(:, :, k), & + tdsops_du%coeffs_s, tdsops_du%coeffs_e, tdsops_du%coeffs, tdsops_du%n, & + tdsops_du%dist_fw, tdsops_du%dist_bw, tdsops_du%dist_af & + ) + + call der_univ_dist( & + d2u(:, :, k), d2u_send_s(:, :, k), d2u_send_e(:, :, k), u(:, :, k), & + u_recv_s(:, :, k), u_recv_e(:, :, k), & + tdsops_d2u%coeffs_s, tdsops_d2u%coeffs_e, tdsops_d2u%coeffs, tdsops_d2u%n, & + tdsops_d2u%dist_fw, tdsops_d2u%dist_bw, tdsops_d2u%dist_af & + ) + + ! Handle dud by locally generating u*v + do j = 1, n + !$omp simd + do i = 1, SZ + ud(i, j) = u(i, j, k) * v(i, j, k) + end do + !$omp end simd + end do + + do j = 1, n_halo + !$omp simd + do i = 1, SZ + ud_recv_s(i, j) = u_recv_s(i, j, k) * v_recv_s(i, j, k) + ud_recv_e(i, j) = u_recv_e(i, j, k) * v_recv_e(i, j, k) + end do + !$omp end simd + end do + + call der_univ_dist( & + dud(:, :, k), dud_send_s(:, :, k), dud_send_e(:, :, k), ud(:, :), & + ud_recv_s(:, :), ud_recv_e(:, :), & + tdsops_dud%coeffs_s, tdsops_dud%coeffs_e, tdsops_dud%coeffs, tdsops_dud%n, & + tdsops_dud%dist_fw, tdsops_dud%dist_bw, tdsops_dud%dist_af & + ) + end do !$omp end parallel do ! halo exchange for 2x2 systems call sendrecv_fields(du_recv_s, du_recv_e, du_send_s, du_send_e, & - n_data, nproc, pprev, pnext) + n_data, nproc, pprev, pnext) call sendrecv_fields(dud_recv_s, dud_recv_e, dud_send_s, dud_send_e, & - n_data, nproc, pprev, pnext) + n_data, nproc, pprev, pnext) call sendrecv_fields(d2u_recv_s, d2u_recv_e, d2u_send_s, d2u_send_e, & - n_data, nproc, pprev, pnext) + n_data, nproc, pprev, pnext) !$omp parallel do do k = 1, n_block - call der_univ_subs(du(:, :, k), & - du_recv_s(:, :, k), du_recv_e(:, :, k), & - tdsops_du%n, tdsops_du%dist_sa, tdsops_du%dist_sc) - - call der_univ_subs(dud(:, :, k), & - dud_recv_s(:, :, k), dud_recv_e(:, :, k), & - tdsops_dud%n, tdsops_dud%dist_sa, tdsops_dud%dist_sc) - - call der_univ_subs(d2u(:, :, k), & - d2u_recv_s(:, :, k), d2u_recv_e(:, :, k), & - tdsops_d2u%n, tdsops_d2u%dist_sa, tdsops_d2u%dist_sc) - - do j = 1, n - !$omp simd - do i = 1, SZ - rhs(i, j, k) = -0.5_dp*(v(i, j, k)*du(i, j, k) + dud(i, j, k)) + nu*d2u(i, j, k) - end do - !$omp end simd - end do + call der_univ_subs(du(:, :, k), & + du_recv_s(:, :, k), du_recv_e(:, :, k), & + tdsops_du%n, tdsops_du%dist_sa, tdsops_du%dist_sc) + + call der_univ_subs(dud(:, :, k), & + dud_recv_s(:, :, k), dud_recv_e(:, :, k), & + tdsops_dud%n, tdsops_dud%dist_sa, tdsops_dud%dist_sc) + + call der_univ_subs(d2u(:, :, k), & + d2u_recv_s(:, :, k), d2u_recv_e(:, :, k), & + tdsops_d2u%n, tdsops_d2u%dist_sa, tdsops_d2u%dist_sc) + + do j = 1, n + !$omp simd + do i = 1, SZ + rhs(i, j, k) = -0.5_dp*(v(i, j, k)*du(i, j, k) + dud(i, j, k)) + nu*d2u(i, j, k) + end do + !$omp end simd + end do end do !$omp end parallel do - end subroutine exec_dist_transeq_compact + end subroutine exec_dist_transeq_compact -end module m_omp_exec_dist + end module m_omp_exec_dist diff --git a/src/omp/kernels/distributed.f90 b/src/omp/kernels/distributed.f90 index a92c10ab..8bc2c9bf 100644 --- a/src/omp/kernels/distributed.f90 +++ b/src/omp/kernels/distributed.f90 @@ -1,14 +1,14 @@ -module m_omp_kernels_dist - use omp_lib + module m_omp_kernels_dist + use omp_lib - use m_common, only: dp - use m_omp_common, only: SZ + use m_common, only: dp + use m_omp_common, only: SZ - implicit none + implicit none -contains + contains - subroutine der_univ_dist( & + subroutine der_univ_dist( & du, send_u_s, send_u_e, u, u_s, u_e, coeffs_s, coeffs_e, coeffs, n, & ffr, fbc, faf & ) @@ -26,7 +26,7 @@ subroutine der_univ_dist( & integer :: i, j!, b real(dp) :: c_m4, c_m3, c_m2, c_m1, c_j, c_p1, c_p2, c_p3, c_p4, & - temp_du, alpha, last_r + temp_du, alpha, last_r ! store bulk coeffs in the registers c_m4 = coeffs(1); c_m3 = coeffs(2); c_m2 = coeffs(3); c_m1 = coeffs(4) @@ -36,46 +36,46 @@ subroutine der_univ_dist( & !$omp simd do i = 1, SZ - du(i, 1) = coeffs_s(1, 1)*u_s(i, 1) & - + coeffs_s(2, 1)*u_s(i, 2) & - + coeffs_s(3, 1)*u_s(i, 3) & - + coeffs_s(4, 1)*u_s(i, 4) & - + coeffs_s(5, 1)*u(i, 1) & - + coeffs_s(6, 1)*u(i, 2) & - + coeffs_s(7, 1)*u(i, 3) & - + coeffs_s(8, 1)*u(i, 4) & - + coeffs_s(9, 1)*u(i, 5) - du(i, 1) = du(i, 1)*faf(1) - du(i, 2) = coeffs_s(1, 2)*u_s(i, 2) & - + coeffs_s(2, 2)*u_s(i, 3) & - + coeffs_s(3, 2)*u_s(i, 4) & - + coeffs_s(4, 2)*u(i, 1) & - + coeffs_s(5, 2)*u(i, 2) & - + coeffs_s(6, 2)*u(i, 3) & - + coeffs_s(7, 2)*u(i, 4) & - + coeffs_s(8, 2)*u(i, 5) & - + coeffs_s(9, 2)*u(i, 6) - du(i, 2) = du(i, 2)*faf(2) - du(i, 3) = coeffs_s(1, 3)*u_s(i, 3) & - + coeffs_s(2, 3)*u_s(i, 4) & - + coeffs_s(3, 3)*u(i, 1) & - + coeffs_s(4, 3)*u(i, 2) & - + coeffs_s(5, 3)*u(i, 3) & - + coeffs_s(6, 3)*u(i, 4) & - + coeffs_s(7, 3)*u(i, 5) & - + coeffs_s(8, 3)*u(i, 6) & - + coeffs_s(9, 3)*u(i, 7) - du(i, 3) = ffr(3)*(du(i, 3) - faf(3)*du(i, 2)) - du(i, 4) = coeffs_s(1, 4)*u_s(i, 4) & - + coeffs_s(2, 4)*u(i, 1) & - + coeffs_s(3, 4)*u(i, 2) & - + coeffs_s(4, 4)*u(i, 3) & - + coeffs_s(5, 4)*u(i, 4) & - + coeffs_s(6, 4)*u(i, 5) & - + coeffs_s(7, 4)*u(i, 6) & - + coeffs_s(8, 4)*u(i, 7) & - + coeffs_s(9, 4)*u(i, 8) - du(i, 4) = ffr(4)*(du(i, 4) - faf(4)*du(i, 3)) + du(i, 1) = coeffs_s(1, 1)*u_s(i, 1) & + + coeffs_s(2, 1)*u_s(i, 2) & + + coeffs_s(3, 1)*u_s(i, 3) & + + coeffs_s(4, 1)*u_s(i, 4) & + + coeffs_s(5, 1)*u(i, 1) & + + coeffs_s(6, 1)*u(i, 2) & + + coeffs_s(7, 1)*u(i, 3) & + + coeffs_s(8, 1)*u(i, 4) & + + coeffs_s(9, 1)*u(i, 5) + du(i, 1) = du(i, 1)*faf(1) + du(i, 2) = coeffs_s(1, 2)*u_s(i, 2) & + + coeffs_s(2, 2)*u_s(i, 3) & + + coeffs_s(3, 2)*u_s(i, 4) & + + coeffs_s(4, 2)*u(i, 1) & + + coeffs_s(5, 2)*u(i, 2) & + + coeffs_s(6, 2)*u(i, 3) & + + coeffs_s(7, 2)*u(i, 4) & + + coeffs_s(8, 2)*u(i, 5) & + + coeffs_s(9, 2)*u(i, 6) + du(i, 2) = du(i, 2)*faf(2) + du(i, 3) = coeffs_s(1, 3)*u_s(i, 3) & + + coeffs_s(2, 3)*u_s(i, 4) & + + coeffs_s(3, 3)*u(i, 1) & + + coeffs_s(4, 3)*u(i, 2) & + + coeffs_s(5, 3)*u(i, 3) & + + coeffs_s(6, 3)*u(i, 4) & + + coeffs_s(7, 3)*u(i, 5) & + + coeffs_s(8, 3)*u(i, 6) & + + coeffs_s(9, 3)*u(i, 7) + du(i, 3) = ffr(3)*(du(i, 3) - faf(3)*du(i, 2)) + du(i, 4) = coeffs_s(1, 4)*u_s(i, 4) & + + coeffs_s(2, 4)*u(i, 1) & + + coeffs_s(3, 4)*u(i, 2) & + + coeffs_s(4, 4)*u(i, 3) & + + coeffs_s(5, 4)*u(i, 4) & + + coeffs_s(6, 4)*u(i, 5) & + + coeffs_s(7, 4)*u(i, 6) & + + coeffs_s(8, 4)*u(i, 7) & + + coeffs_s(9, 4)*u(i, 8) + du(i, 4) = ffr(4)*(du(i, 4) - faf(4)*du(i, 3)) end do !$omp end simd @@ -83,91 +83,91 @@ subroutine der_univ_dist( & alpha = faf(5) do j = 5, n - 4 - !$omp simd - do i = 1, SZ - temp_du = c_m4*u(i, j - 4) + c_m3*u(i, j - 3) & - + c_m2*u(i, j - 2) + c_m1*u(i, j - 1) & - + c_j*u(i, j) & - + c_p1*u(i, j + 1) + c_p2*u(i, j + 2) & - + c_p3*u(i, j + 3) + c_p4*u(i, j + 4) - du(i, j) = ffr(j)*(temp_du - alpha*du(i, j - 1)) - end do - !$omp end simd + !$omp simd + do i = 1, SZ + temp_du = c_m4*u(i, j - 4) + c_m3*u(i, j - 3) & + + c_m2*u(i, j - 2) + c_m1*u(i, j - 1) & + + c_j*u(i, j) & + + c_p1*u(i, j + 1) + c_p2*u(i, j + 2) & + + c_p3*u(i, j + 3) + c_p4*u(i, j + 4) + du(i, j) = ffr(j)*(temp_du - alpha*du(i, j - 1)) + end do + !$omp end simd end do !$omp simd do i = 1, SZ - j = n - 3 - du(i, j) = coeffs_e(1, 1)*u(i, j - 4) & - + coeffs_e(2, 1)*u(i, j - 3) & - + coeffs_e(3, 1)*u(i, j - 2) & - + coeffs_e(4, 1)*u(i, j - 1) & - + coeffs_e(5, 1)*u(i, j) & - + coeffs_e(6, 1)*u(i, j + 1) & - + coeffs_e(7, 1)*u(i, j + 2) & - + coeffs_e(8, 1)*u(i, j + 3) & - + coeffs_e(9, 1)*u_e(i, 1) - du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) - j = n - 2 - du(i, j) = coeffs_e(1, 2)*u(i, j - 4) & - + coeffs_e(2, 2)*u(i, j - 3) & - + coeffs_e(3, 2)*u(i, j - 2) & - + coeffs_e(4, 2)*u(i, j - 1) & - + coeffs_e(5, 2)*u(i, j) & - + coeffs_e(6, 2)*u(i, j + 1) & - + coeffs_e(7, 2)*u(i, j + 2) & - + coeffs_e(8, 2)*u_e(i, 1) & - + coeffs_e(9, 2)*u_e(i, 2) - du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) - j = n - 1 - du(i, j) = coeffs_e(1, 3)*u(i, j - 4) & - + coeffs_e(2, 3)*u(i, j - 3) & - + coeffs_e(3, 3)*u(i, j - 2) & - + coeffs_e(4, 3)*u(i, j - 1) & - + coeffs_e(5, 3)*u(i, j) & - + coeffs_e(6, 3)*u(i, j + 1) & - + coeffs_e(7, 3)*u_e(i, 1) & - + coeffs_e(8, 3)*u_e(i, 2) & - + coeffs_e(9, 3)*u_e(i, 3) - du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) - j = n - du(i, j) = coeffs_e(1, 4)*u(i, j - 4) & - + coeffs_e(2, 4)*u(i, j - 3) & - + coeffs_e(3, 4)*u(i, j - 2) & - + coeffs_e(4, 4)*u(i, j - 1) & - + coeffs_e(5, 4)*u(i, j) & - + coeffs_e(6, 4)*u_e(i, 1) & - + coeffs_e(7, 4)*u_e(i, 2) & - + coeffs_e(8, 4)*u_e(i, 3) & - + coeffs_e(9, 4)*u_e(i, 4) - du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) + j = n - 3 + du(i, j) = coeffs_e(1, 1)*u(i, j - 4) & + + coeffs_e(2, 1)*u(i, j - 3) & + + coeffs_e(3, 1)*u(i, j - 2) & + + coeffs_e(4, 1)*u(i, j - 1) & + + coeffs_e(5, 1)*u(i, j) & + + coeffs_e(6, 1)*u(i, j + 1) & + + coeffs_e(7, 1)*u(i, j + 2) & + + coeffs_e(8, 1)*u(i, j + 3) & + + coeffs_e(9, 1)*u_e(i, 1) + du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) + j = n - 2 + du(i, j) = coeffs_e(1, 2)*u(i, j - 4) & + + coeffs_e(2, 2)*u(i, j - 3) & + + coeffs_e(3, 2)*u(i, j - 2) & + + coeffs_e(4, 2)*u(i, j - 1) & + + coeffs_e(5, 2)*u(i, j) & + + coeffs_e(6, 2)*u(i, j + 1) & + + coeffs_e(7, 2)*u(i, j + 2) & + + coeffs_e(8, 2)*u_e(i, 1) & + + coeffs_e(9, 2)*u_e(i, 2) + du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) + j = n - 1 + du(i, j) = coeffs_e(1, 3)*u(i, j - 4) & + + coeffs_e(2, 3)*u(i, j - 3) & + + coeffs_e(3, 3)*u(i, j - 2) & + + coeffs_e(4, 3)*u(i, j - 1) & + + coeffs_e(5, 3)*u(i, j) & + + coeffs_e(6, 3)*u(i, j + 1) & + + coeffs_e(7, 3)*u_e(i, 1) & + + coeffs_e(8, 3)*u_e(i, 2) & + + coeffs_e(9, 3)*u_e(i, 3) + du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) + j = n + du(i, j) = coeffs_e(1, 4)*u(i, j - 4) & + + coeffs_e(2, 4)*u(i, j - 3) & + + coeffs_e(3, 4)*u(i, j - 2) & + + coeffs_e(4, 4)*u(i, j - 1) & + + coeffs_e(5, 4)*u(i, j) & + + coeffs_e(6, 4)*u_e(i, 1) & + + coeffs_e(7, 4)*u_e(i, 2) & + + coeffs_e(8, 4)*u_e(i, 3) & + + coeffs_e(9, 4)*u_e(i, 4) + du(i, j) = ffr(j)*(du(i, j) - faf(j)*du(i, j - 1)) end do !$omp end simd !$omp simd do i = 1, SZ - send_u_e(i, 1) = du(i, n) + send_u_e(i, 1) = du(i, n) end do !$omp end simd ! Backward pass of the hybrid algorithm do j = n - 2, 2, -1 - !$omp simd - do i = 1, SZ - du(i, j) = du(i, j) - fbc(j)*du(i, j + 1) - end do - !$omp end simd + !$omp simd + do i = 1, SZ + du(i, j) = du(i, j) - fbc(j)*du(i, j + 1) + end do + !$omp end simd end do !$omp simd do i = 1, SZ - du(i, 1) = last_r*(du(i, 1) - fbc(1)*du(i, 2)) - send_u_s(i, 1) = du(i, 1) + du(i, 1) = last_r*(du(i, 1) - fbc(1)*du(i, 2)) + send_u_s(i, 1) = du(i, 1) end do !$omp end simd - end subroutine der_univ_dist + end subroutine der_univ_dist - subroutine der_univ_subs(du, recv_u_s, recv_u_e, n, dist_sa, dist_sc) + subroutine der_univ_subs(du, recv_u_s, recv_u_e, n, dist_sa, dist_sc) implicit none ! Arguments @@ -183,48 +183,48 @@ subroutine der_univ_subs(du, recv_u_s, recv_u_e, n, dist_sa, dist_sc) !$omp simd do i = 1, SZ - ! A small trick we do here is valid for symmetric Toeplitz matrices. - ! In our case our matrices satisfy this criteria in the (5:n-4) region - ! and as long as a rank has around at least 20 entries the assumptions - ! we make here are perfectly valid. - - ! bl is the bottom left entry in the 2x2 matrix - ! ur is the upper right entry in the 2x2 matrix - - ! Start - ! At the start we have the 'bl', and assume 'ur' - bl = dist_sa(1) - ur = dist_sa(1) - recp = 1._dp/(1._dp - ur*bl) - du_s(i) = recp*(du(i, 1) - bl*recv_u_s(i, 1)) - - ! End - ! At the end we have the 'ur', and assume 'bl' - bl = dist_sc(n) - ur = dist_sc(n) - recp = 1._dp/(1._dp - ur*bl) - du_e(i) = recp*(du(i, n) - ur*recv_u_e(i, 1)) + ! A small trick we do here is valid for symmetric Toeplitz matrices. + ! In our case our matrices satisfy this criteria in the (5:n-4) region + ! and as long as a rank has around at least 20 entries the assumptions + ! we make here are perfectly valid. + + ! bl is the bottom left entry in the 2x2 matrix + ! ur is the upper right entry in the 2x2 matrix + + ! Start + ! At the start we have the 'bl', and assume 'ur' + bl = dist_sa(1) + ur = dist_sa(1) + recp = 1._dp/(1._dp - ur*bl) + du_s(i) = recp*(du(i, 1) - bl*recv_u_s(i, 1)) + + ! End + ! At the end we have the 'ur', and assume 'bl' + bl = dist_sc(n) + ur = dist_sc(n) + recp = 1._dp/(1._dp - ur*bl) + du_e(i) = recp*(du(i, n) - ur*recv_u_e(i, 1)) end do !$omp end simd !$omp simd do i = 1, SZ - du(i, 1) = du_s(i) + du(i, 1) = du_s(i) end do !$omp end simd do j = 2, n - 1 - !$omp simd - do i = 1, SZ - du(i, j) = (du(i, j) - dist_sa(j)*du_s(i) - dist_sc(j)*du_e(i)) - end do - !$omp end simd + !$omp simd + do i = 1, SZ + du(i, j) = (du(i, j) - dist_sa(j)*du_s(i) - dist_sc(j)*du_e(i)) + end do + !$omp end simd end do !$omp simd do i = 1, SZ - du(i, n) = du_e(i) + du(i, n) = du_e(i) end do !$omp end simd - end subroutine der_univ_subs + end subroutine der_univ_subs -end module m_omp_kernels_dist + end module m_omp_kernels_dist diff --git a/src/omp/poisson_fft.f90 b/src/omp/poisson_fft.f90 index a796f1fc..60bd53c6 100644 --- a/src/omp/poisson_fft.f90 +++ b/src/omp/poisson_fft.f90 @@ -1,32 +1,32 @@ -module m_omp_poisson_fft - use m_allocator, only: field_t - use m_common, only: dp - use m_poisson_fft, only: poisson_fft_t - use m_tdsops, only: dirps_t + module m_omp_poisson_fft + use m_allocator, only: field_t + use m_common, only: dp + use m_poisson_fft, only: poisson_fft_t + use m_tdsops, only: dirps_t - use m_omp_common, only: SZ + use m_omp_common, only: SZ - implicit none + implicit none - type, extends(poisson_fft_t) :: omp_poisson_fft_t + type, extends(poisson_fft_t) :: omp_poisson_fft_t !! FFT based Poisson solver !! It can only handle 1D decompositions along z direction. complex(dp), allocatable, dimension(:, :, :) :: c_x, c_y, c_z - contains + contains procedure :: fft_forward => fft_forward_omp procedure :: fft_backward => fft_backward_omp procedure :: fft_postprocess => fft_postprocess_omp - end type omp_poisson_fft_t + end type omp_poisson_fft_t - interface omp_poisson_fft_t + interface omp_poisson_fft_t module procedure init - end interface omp_poisson_fft_t + end interface omp_poisson_fft_t - private :: init + private :: init -contains + contains - function init(xdirps, ydirps, zdirps) result(poisson_fft) + function init(xdirps, ydirps, zdirps) result(poisson_fft) implicit none class(dirps_t), intent(in) :: xdirps, ydirps, zdirps @@ -35,26 +35,26 @@ function init(xdirps, ydirps, zdirps) result(poisson_fft) call poisson_fft%base_init(xdirps, ydirps, zdirps, SZ) - end function init + end function init - subroutine fft_forward_omp(self, f_in) + subroutine fft_forward_omp(self, f_in) implicit none class(omp_poisson_fft_t) :: self class(field_t), intent(in) :: f_in - end subroutine fft_forward_omp + end subroutine fft_forward_omp - subroutine fft_backward_omp(self, f_out) + subroutine fft_backward_omp(self, f_out) implicit none class(omp_poisson_fft_t) :: self class(field_t), intent(inout) :: f_out - end subroutine fft_backward_omp + end subroutine fft_backward_omp - subroutine fft_postprocess_omp(self) + subroutine fft_postprocess_omp(self) implicit none class(omp_poisson_fft_t) :: self - end subroutine fft_postprocess_omp + end subroutine fft_postprocess_omp -end module m_omp_poisson_fft + end module m_omp_poisson_fft diff --git a/src/omp/sendrecv.f90 b/src/omp/sendrecv.f90 index 0ebd4bca..bb5a5594 100644 --- a/src/omp/sendrecv.f90 +++ b/src/omp/sendrecv.f90 @@ -1,14 +1,14 @@ -module m_omp_sendrecv - use mpi + module m_omp_sendrecv + use mpi - use m_common, only: dp + use m_common, only: dp - implicit none + implicit none -contains + contains - subroutine sendrecv_fields(f_recv_s, f_recv_e, f_send_s, f_send_e, & - n_data, nproc, prev, next) + subroutine sendrecv_fields(f_recv_s, f_recv_e, f_send_s, f_send_e, & + n_data, nproc, prev, next) implicit none real(dp), dimension(:, :, :), intent(out) :: f_recv_s, f_recv_e @@ -18,21 +18,21 @@ subroutine sendrecv_fields(f_recv_s, f_recv_e, f_send_s, f_send_e, & integer :: req(4), err(4), ierr, tag = 1234 if (nproc == 1) then - f_recv_s = f_send_e - f_recv_e = f_send_s + f_recv_s = f_send_e + f_recv_e = f_send_s else - call MPI_Isend(f_send_s, n_data, MPI_DOUBLE_PRECISION, & - prev, tag, MPI_COMM_WORLD, req(1), err(1)) - call MPI_Irecv(f_recv_e, n_data, MPI_DOUBLE_PRECISION, & - next, tag, MPI_COMM_WORLD, req(2), err(2)) - call MPI_Isend(f_send_e, n_data, MPI_DOUBLE_PRECISION, & - next, tag, MPI_COMM_WORLD, req(3), err(3)) - call MPI_Irecv(f_recv_s, n_data, MPI_DOUBLE_PRECISION, & - prev, tag, MPI_COMM_WORLD, req(4), err(4)) - - call MPI_Waitall(4, req, MPI_STATUSES_IGNORE, ierr) + call MPI_Isend(f_send_s, n_data, MPI_DOUBLE_PRECISION, & + prev, tag, MPI_COMM_WORLD, req(1), err(1)) + call MPI_Irecv(f_recv_e, n_data, MPI_DOUBLE_PRECISION, & + next, tag, MPI_COMM_WORLD, req(2), err(2)) + call MPI_Isend(f_send_e, n_data, MPI_DOUBLE_PRECISION, & + next, tag, MPI_COMM_WORLD, req(3), err(3)) + call MPI_Irecv(f_recv_s, n_data, MPI_DOUBLE_PRECISION, & + prev, tag, MPI_COMM_WORLD, req(4), err(4)) + + call MPI_Waitall(4, req, MPI_STATUSES_IGNORE, ierr) end if - end subroutine sendrecv_fields + end subroutine sendrecv_fields -end module m_omp_sendrecv + end module m_omp_sendrecv diff --git a/src/ordering.f90 b/src/ordering.f90 index 5fb6475d..fa73aec8 100644 --- a/src/ordering.f90 +++ b/src/ordering.f90 @@ -1,65 +1,65 @@ -module m_ordering + module m_ordering - use m_common, only: dp, DIR_X, DIR_Y, DIR_Z, & - RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y + use m_common, only: dp, DIR_X, DIR_Y, DIR_Z, & + RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y - implicit none -contains - !! - !! "Application storage" stores spatial data with a directionality for better cache locality - !! This set of functions converts indices from this application storage (_dir) to cartesian indices (_ijk) - !! + implicit none + contains + !! + !! "Application storage" stores spatial data with a directionality for better cache locality + !! This set of functions converts indices from this application storage (_dir) to cartesian indices (_ijk) + !! - pure subroutine get_index_ijk(i, j, k, dir_i, dir_j, dir_k, dir, SZ, nx_loc, ny_loc, nz_loc) + pure subroutine get_index_ijk(i, j, k, dir_i, dir_j, dir_k, dir, SZ, nx_loc, ny_loc, nz_loc) !! Get cartesian index from application storage directional one integer, intent(out) :: i, j, k ! cartesian indices integer, intent(in) :: dir_i, dir_j, dir_k ! application storage indices integer, intent(in) :: dir ! direction of the applicatino storage indices integer, intent(in) :: SZ, nx_loc, ny_loc, nz_loc ! dimensions of the block - + select case (dir) - case (DIR_X) - i = dir_j - j = mod(dir_k - 1, ny_loc/SZ)*SZ + dir_i - k = 1 + (dir_k - 1)/(ny_loc/SZ) - case (DIR_Y) - i = mod(dir_k - 1, nx_loc/SZ)*SZ + dir_i - j = dir_j - k = 1 + (dir_k - 1)/(nx_loc/SZ) - case (DIR_Z) - i = mod(dir_k - 1, nx_loc/SZ)*SZ + dir_i - j = 1 + (dir_k - 1)/(nx_loc/SZ) - k = dir_j + case (DIR_X) + i = dir_j + j = mod(dir_k - 1, ny_loc/SZ)*SZ + dir_i + k = 1 + (dir_k - 1)/(ny_loc/SZ) + case (DIR_Y) + i = mod(dir_k - 1, nx_loc/SZ)*SZ + dir_i + j = dir_j + k = 1 + (dir_k - 1)/(nx_loc/SZ) + case (DIR_Z) + i = mod(dir_k - 1, nx_loc/SZ)*SZ + dir_i + j = 1 + (dir_k - 1)/(nx_loc/SZ) + k = dir_j end select - end subroutine get_index_ijk + end subroutine get_index_ijk - pure subroutine get_index_dir(dir_i, dir_j, dir_k, i, j, k, dir, SZ, nx_loc, ny_loc, nz_loc) + pure subroutine get_index_dir(dir_i, dir_j, dir_k, i, j, k, dir, SZ, nx_loc, ny_loc, nz_loc) !! Get application storage directional index from cartesian index integer, intent(out) :: dir_i, dir_j, dir_k ! application storage indices integer, intent(in) :: i, j, k ! cartesian indices integer, intent(in) :: dir ! direction of the application storage indices integer, intent(in) :: SZ, nx_loc, ny_loc, nz_loc ! dimensions of the block - + select case (dir) - case (DIR_X) - dir_i = mod(j-1, SZ) + 1 - dir_j = i - dir_k = (ny_loc/SZ)*(k-1) + 1 + (j-1)/SZ - case (DIR_Y) - dir_i = mod(i-1, SZ) + 1 - dir_j = j - dir_k = (nx_loc/SZ)*(k-1) + 1 + (i-1)/SZ - case (DIR_Z) - dir_i = mod(i-1, SZ) + 1 - dir_j = k - dir_k = (nx_loc/SZ)*(j-1) + 1 + (i-1)/SZ + case (DIR_X) + dir_i = mod(j-1, SZ) + 1 + dir_j = i + dir_k = (ny_loc/SZ)*(k-1) + 1 + (j-1)/SZ + case (DIR_Y) + dir_i = mod(i-1, SZ) + 1 + dir_j = j + dir_k = (nx_loc/SZ)*(k-1) + 1 + (i-1)/SZ + case (DIR_Z) + dir_i = mod(i-1, SZ) + 1 + dir_j = k + dir_k = (nx_loc/SZ)*(j-1) + 1 + (i-1)/SZ end select - end subroutine get_index_dir + end subroutine get_index_dir - pure subroutine get_index_reordering(out_i, out_j, out_k, in_i, in_j, in_k, reorder_dir, SZ, nx_loc, ny_loc, nz_loc) - !! Converts a set of application storage directional index to an other direction. + pure subroutine get_index_reordering(out_i, out_j, out_k, in_i, in_j, in_k, reorder_dir, SZ, nx_loc, ny_loc, nz_loc) + !! Converts a set of application storage directional index to an other direction. !! The two directions are defined by the reorder_dir variable, RDR_X2Y will go from storage in X to Y etc. integer, intent(out) :: out_i, out_j, out_k ! new indices in the application storage integer, intent(in) :: in_i, in_j, in_k ! original indices @@ -69,30 +69,30 @@ pure subroutine get_index_reordering(out_i, out_j, out_k, in_i, in_j, in_k, reor integer :: dir_in, dir_out select case (reorder_dir) - case (RDR_X2Y) - dir_in = DIR_X - dir_out = DIR_Y - case (RDR_X2Z) - dir_in = DIR_X - dir_out = DIR_Z - case (RDR_Y2X) - dir_in = DIR_Y - dir_out = DIR_X - case (RDR_Y2Z) - dir_in = DIR_Y - dir_out = DIR_Z - case (RDR_Z2X) - dir_in = DIR_Z - dir_out = DIR_X - case (RDR_Z2Y) - dir_in = DIR_Z - dir_out = DIR_Y + case (RDR_X2Y) + dir_in = DIR_X + dir_out = DIR_Y + case (RDR_X2Z) + dir_in = DIR_X + dir_out = DIR_Z + case (RDR_Y2X) + dir_in = DIR_Y + dir_out = DIR_X + case (RDR_Y2Z) + dir_in = DIR_Y + dir_out = DIR_Z + case (RDR_Z2X) + dir_in = DIR_Z + dir_out = DIR_X + case (RDR_Z2Y) + dir_in = DIR_Z + dir_out = DIR_Y end select call get_index_ijk(i, j, k, in_i, in_j, in_k, dir_in, SZ, nx_loc, ny_loc, nz_loc) call get_index_dir(out_i, out_j, out_k, i, j, k, dir_out, SZ, nx_loc, ny_loc, nz_loc) - end subroutine get_index_reordering + end subroutine get_index_reordering -end module m_ordering + end module m_ordering diff --git a/src/poisson_fft.f90 b/src/poisson_fft.f90 index 9ee0f575..d25154ff 100644 --- a/src/poisson_fft.f90 +++ b/src/poisson_fft.f90 @@ -1,54 +1,54 @@ -module m_poisson_fft - use m_allocator, only: field_t - use m_common, only: dp, pi - use m_tdsops, only: dirps_t + module m_poisson_fft + use m_allocator, only: field_t + use m_common, only: dp, pi + use m_tdsops, only: dirps_t - implicit none + implicit none - type, abstract :: poisson_fft_t + type, abstract :: poisson_fft_t !! FFT based Poisson solver !! It can only handle 1D decompositions along z direction. integer :: nx, ny, nz complex(dp), allocatable, dimension(:, :, :) :: waves complex(dp), allocatable, dimension(:) :: ax, bx, ay, by, az, bz - contains + contains procedure(fft_forward), deferred :: fft_forward procedure(fft_backward), deferred :: fft_backward procedure(fft_postprocess), deferred :: fft_postprocess procedure :: base_init procedure :: waves_set - end type poisson_fft_t + end type poisson_fft_t - abstract interface + abstract interface subroutine fft_forward(self, f_in) - import :: poisson_fft_t - import :: field_t - implicit none + import :: poisson_fft_t + import :: field_t + implicit none - class(poisson_fft_t) :: self - class(field_t), intent(in) :: f_in + class(poisson_fft_t) :: self + class(field_t), intent(in) :: f_in end subroutine fft_forward subroutine fft_backward(self, f_out) - import :: poisson_fft_t - import :: field_t - implicit none + import :: poisson_fft_t + import :: field_t + implicit none - class(poisson_fft_t) :: self - class(field_t), intent(inout) :: f_out + class(poisson_fft_t) :: self + class(field_t), intent(inout) :: f_out end subroutine fft_backward subroutine fft_postprocess(self) - import :: poisson_fft_t - implicit none + import :: poisson_fft_t + implicit none - class(poisson_fft_t) :: self + class(poisson_fft_t) :: self end subroutine fft_postprocess - end interface + end interface -contains + contains - subroutine base_init(self, xdirps, ydirps, zdirps, sz) + subroutine base_init(self, xdirps, ydirps, zdirps, sz) implicit none class(poisson_fft_t) :: self @@ -66,9 +66,9 @@ subroutine base_init(self, xdirps, ydirps, zdirps, sz) ! waves_set requires some of the preprocessed tdsops variables. call self%waves_set(xdirps, ydirps, zdirps, sz) - end subroutine base_init + end subroutine base_init - subroutine waves_set(self, xdirps, ydirps, zdirps, sz) + subroutine waves_set(self, xdirps, ydirps, zdirps, sz) !! Ref. JCP 228 (2009), 5989–6015, Sec 4 implicit none @@ -77,7 +77,7 @@ subroutine waves_set(self, xdirps, ydirps, zdirps, sz) integer, intent(in) :: sz complex(dp), allocatable, dimension(:) :: xkx, xk2, yky, yk2, zkz, zk2, & - exs, eys, ezs + exs, eys, ezs integer :: nx, ny, nz real(dp) :: w, wp, rlexs, rleys, rlezs, xtt, ytt, ztt, xt1, yt1, zt1 @@ -88,18 +88,18 @@ subroutine waves_set(self, xdirps, ydirps, zdirps, sz) nx = xdirps%n; ny = ydirps%n; nz = zdirps%n do i = 1, nx - self%ax(i) = sin((i-1)*pi/nx) - self%bx(i) = cos((i-1)*pi/nx) + self%ax(i) = sin((i-1)*pi/nx) + self%bx(i) = cos((i-1)*pi/nx) end do do i = 1, ny - self%ay(i) = sin((i-1)*pi/ny) - self%by(i) = cos((i-1)*pi/ny) + self%ay(i) = sin((i-1)*pi/ny) + self%by(i) = cos((i-1)*pi/ny) end do do i = 1, nz - self%az(i) = sin((i-1)*pi/nz) - self%bz(i) = cos((i-1)*pi/nz) + self%az(i) = sin((i-1)*pi/nz) + self%bz(i) = cos((i-1)*pi/nz) end do ! Now kxyz @@ -110,89 +110,89 @@ subroutine waves_set(self, xdirps, ydirps, zdirps, sz) ! periodic-x do i = 1, nx/2 + 1 - w = 2*pi*(i - 1)/nx - wp = xdirps%stagder_v2p%a*2*xdirps%d*sin(0.5_dp*w) & - + xdirps%stagder_v2p%b*2*xdirps%d*sin(1.5_dp*w) - wp = wp/(1._dp + 2*xdirps%stagder_v2p%alpha*cos(w)) - - xkx(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*wp/xdirps%L) - exs(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*w/xdirps%L) - xk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*wp/xdirps%L)**2 + w = 2*pi*(i - 1)/nx + wp = xdirps%stagder_v2p%a*2*xdirps%d*sin(0.5_dp*w) & + + xdirps%stagder_v2p%b*2*xdirps%d*sin(1.5_dp*w) + wp = wp/(1._dp + 2*xdirps%stagder_v2p%alpha*cos(w)) + + xkx(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*wp/xdirps%L) + exs(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*w/xdirps%L) + xk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nx*wp/xdirps%L)**2 end do do i = nx/2 + 2, nx - xkx(i) = xkx(nx - i + 2) - exs(i) = exs(nx - i + 2) - xk2(i) = xk2(nx - i + 2) + xkx(i) = xkx(nx - i + 2) + exs(i) = exs(nx - i + 2) + xk2(i) = xk2(nx - i + 2) end do ! periodic-y do i = 1, ny/2 + 1 - w = 2*pi*(i - 1)/ny - wp = ydirps%stagder_v2p%a*2*ydirps%d*sin(0.5_dp*w) & - + ydirps%stagder_v2p%b*2*ydirps%d*sin(1.5_dp*w) - wp = wp/(1._dp + 2*ydirps%stagder_v2p%alpha*cos(w)) - - yky(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*wp/ydirps%L) - eys(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*w/ydirps%L) - yk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*wp/ydirps%L)**2 + w = 2*pi*(i - 1)/ny + wp = ydirps%stagder_v2p%a*2*ydirps%d*sin(0.5_dp*w) & + + ydirps%stagder_v2p%b*2*ydirps%d*sin(1.5_dp*w) + wp = wp/(1._dp + 2*ydirps%stagder_v2p%alpha*cos(w)) + + yky(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*wp/ydirps%L) + eys(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*w/ydirps%L) + yk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(ny*wp/ydirps%L)**2 end do do i = ny/2 + 2, ny - yky(i) = yky(ny-i+2) - eys(i) = eys(ny-i+2) - yk2(i) = yk2(ny-i+2) + yky(i) = yky(ny-i+2) + eys(i) = eys(ny-i+2) + yk2(i) = yk2(ny-i+2) end do ! periodic-z do i = 1, nz/2 + 1 - w = 2*pi*(i - 1)/nz - wp = zdirps%stagder_v2p%a*2*zdirps%d*sin(0.5_dp*w) & - + zdirps%stagder_v2p%b*2*zdirps%d*sin(1.5_dp*w) - wp = wp/(1._dp + 2*zdirps%stagder_v2p%alpha*cos(w)) - - zkz(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L) - ezs(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*w/zdirps%L) - zk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L)**2 + w = 2*pi*(i - 1)/nz + wp = zdirps%stagder_v2p%a*2*zdirps%d*sin(0.5_dp*w) & + + zdirps%stagder_v2p%b*2*zdirps%d*sin(1.5_dp*w) + wp = wp/(1._dp + 2*zdirps%stagder_v2p%alpha*cos(w)) + + zkz(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L) + ezs(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*w/zdirps%L) + zk2(i) = cmplx(1._dp, 1._dp, kind=dp)*(nz*wp/zdirps%L)**2 end do print*, 'waves array is correctly set only for a single rank run' ! TODO: do loop ranges below are valid only for single rank runs do ka = 1, nz/2 + 1 - do kb = 1, ny/sz - do j = 1, nx - do i = 1, sz - ix = j; iy = (kb - 1)*sz + i; iz = ka - rlexs = real(exs(ix), kind=dp)*xdirps%d - rleys = real(eys(iy), kind=dp)*ydirps%d - rlezs = real(ezs(iz), kind=dp)*zdirps%d - - xtt = 2*(xdirps%interpl_v2p%a*cos(rlexs*0.5_dp) & - + xdirps%interpl_v2p%b*cos(rlexs*1.5_dp) & - + xdirps%interpl_v2p%c*cos(rlexs*2.5_dp) & - + xdirps%interpl_v2p%d*cos(rlexs*3.5_dp)) - ytt = 2*(ydirps%interpl_v2p%a*cos(rleys*0.5_dp) & - + ydirps%interpl_v2p%b*cos(rleys*1.5_dp) & - + ydirps%interpl_v2p%c*cos(rleys*2.5_dp) & - + ydirps%interpl_v2p%d*cos(rleys*3.5_dp)) - ztt = 2*(zdirps%interpl_v2p%a*cos(rlezs*0.5_dp) & - + zdirps%interpl_v2p%b*cos(rlezs*1.5_dp) & - + zdirps%interpl_v2p%c*cos(rlezs*2.5_dp) & - + zdirps%interpl_v2p%d*cos(rlezs*3.5_dp)) - - xt1 = 1._dp + 2*xdirps%interpl_v2p%alpha*cos(rlexs) - yt1 = 1._dp + 2*ydirps%interpl_v2p%alpha*cos(rleys) - zt1 = 1._dp + 2*zdirps%interpl_v2p%alpha*cos(rlezs) - - xt2 = xk2(ix)*(((ytt/yt1)*(ztt/zt1))**2) - yt2 = yk2(iy)*(((xtt/xt1)*(ztt/zt1))**2) - zt2 = zk2(iz)*(((xtt/xt1)*(ytt/yt1))**2) - - xyzk = xt2 + yt2 + zt2 - self%waves(i, j, ka + (kb - 1)*(nz/2 + 1)) = xyzk - end do + do kb = 1, ny/sz + do j = 1, nx + do i = 1, sz + ix = j; iy = (kb - 1)*sz + i; iz = ka + rlexs = real(exs(ix), kind=dp)*xdirps%d + rleys = real(eys(iy), kind=dp)*ydirps%d + rlezs = real(ezs(iz), kind=dp)*zdirps%d + + xtt = 2*(xdirps%interpl_v2p%a*cos(rlexs*0.5_dp) & + + xdirps%interpl_v2p%b*cos(rlexs*1.5_dp) & + + xdirps%interpl_v2p%c*cos(rlexs*2.5_dp) & + + xdirps%interpl_v2p%d*cos(rlexs*3.5_dp)) + ytt = 2*(ydirps%interpl_v2p%a*cos(rleys*0.5_dp) & + + ydirps%interpl_v2p%b*cos(rleys*1.5_dp) & + + ydirps%interpl_v2p%c*cos(rleys*2.5_dp) & + + ydirps%interpl_v2p%d*cos(rleys*3.5_dp)) + ztt = 2*(zdirps%interpl_v2p%a*cos(rlezs*0.5_dp) & + + zdirps%interpl_v2p%b*cos(rlezs*1.5_dp) & + + zdirps%interpl_v2p%c*cos(rlezs*2.5_dp) & + + zdirps%interpl_v2p%d*cos(rlezs*3.5_dp)) + + xt1 = 1._dp + 2*xdirps%interpl_v2p%alpha*cos(rlexs) + yt1 = 1._dp + 2*ydirps%interpl_v2p%alpha*cos(rleys) + zt1 = 1._dp + 2*zdirps%interpl_v2p%alpha*cos(rlezs) + + xt2 = xk2(ix)*(((ytt/yt1)*(ztt/zt1))**2) + yt2 = yk2(iy)*(((xtt/xt1)*(ztt/zt1))**2) + zt2 = zk2(iz)*(((xtt/xt1)*(ytt/yt1))**2) + + xyzk = xt2 + yt2 + zt2 + self%waves(i, j, ka + (kb - 1)*(nz/2 + 1)) = xyzk end do - end do + end do + end do end do - end subroutine waves_set + end subroutine waves_set -end module m_poisson_fft + end module m_poisson_fft diff --git a/src/solver.f90 b/src/solver.f90 index 977eb5d9..9c462ec7 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -1,16 +1,16 @@ -module m_solver - use m_allocator, only: allocator_t, field_t - use m_base_backend, only: base_backend_t - use m_common, only: dp, globs_t, & - RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, & - POISSON_SOLVER_FFT, POISSON_SOLVER_CG, & - DIR_X, DIR_Y, DIR_Z - use m_tdsops, only: tdsops_t, dirps_t - use m_time_integrator, only: time_intg_t - - implicit none - - type :: solver_t + module m_solver + use m_allocator, only: allocator_t, field_t + use m_base_backend, only: base_backend_t + use m_common, only: dp, globs_t, & + RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, & + POISSON_SOLVER_FFT, POISSON_SOLVER_CG, & + DIR_X, DIR_Y, DIR_Z + use m_tdsops, only: tdsops_t, dirps_t + use m_time_integrator, only: time_intg_t + + implicit none + + type :: solver_t !! solver class defines the Incompact3D algorithm at a very high level. !! !! Procedures defined here that are part of the Incompact3D algorithm @@ -47,34 +47,34 @@ module m_solver class(dirps_t), pointer :: xdirps, ydirps, zdirps class(time_intg_t), pointer :: time_integrator procedure(poisson_solver), pointer :: poisson => null() - contains + contains procedure :: transeq procedure :: divergence_v2p procedure :: gradient_p2v procedure :: curl procedure :: output procedure :: run - end type solver_t + end type solver_t - abstract interface + abstract interface subroutine poisson_solver(self, pressure, div_u) - import :: solver_t - import :: field_t - implicit none + import :: solver_t + import :: field_t + implicit none - class(solver_t) :: self - class(field_t), intent(inout) :: pressure - class(field_t), intent(in) :: div_u + class(solver_t) :: self + class(field_t), intent(inout) :: pressure + class(field_t), intent(in) :: div_u end subroutine poisson_solver - end interface + end interface - interface solver_t + interface solver_t module procedure init - end interface solver_t + end interface solver_t -contains + contains - function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & + function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & result(solver) implicit none @@ -114,17 +114,17 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & nx = globs%nx_loc; ny = globs%ny_loc; nz = globs%nz_loc do k = 1, nz - do j = 1, ny - do i = 1, nx - x = (i - 1)*globs%dx - y = (j - 1)*globs%dy - z = (k - 1)*globs%dz - - u_init(i, j, k) = sin(x)*cos(y)*cos(z) - v_init(i, j, k) = -cos(x)*sin(y)*cos(z) - w_init(i, j, k) = 0 - end do - end do + do j = 1, ny + do i = 1, nx + x = (i - 1)*globs%dx + y = (j - 1)*globs%dy + z = (k - 1)*globs%dz + + u_init(i, j, k) = sin(x)*cos(y)*cos(z) + v_init(i, j, k) = -cos(x)*sin(y)*cos(z) + w_init(i, j, k) = 0 + end do + end do end do call solver%backend%set_field_data(solver%u, u_init) @@ -140,44 +140,44 @@ function init(backend, time_integrator, xdirps, ydirps, zdirps, globs) & call allocate_tdsops(solver%zdirps, nz, globs%dz, solver%backend) select case (globs%poisson_solver_type) - case (POISSON_SOLVER_FFT) - print*, 'Poisson solver: FFT' - call solver%backend%init_poisson_fft(xdirps, ydirps, zdirps) - solver%poisson => poisson_fft - case (POISSON_SOLVER_CG) - print*, 'Poisson solver: CG, not yet implemented' - solver%poisson => poisson_cg + case (POISSON_SOLVER_FFT) + print*, 'Poisson solver: FFT' + call solver%backend%init_poisson_fft(xdirps, ydirps, zdirps) + solver%poisson => poisson_fft + case (POISSON_SOLVER_CG) + print*, 'Poisson solver: CG, not yet implemented' + solver%poisson => poisson_cg end select - end function init + end function init - subroutine allocate_tdsops(dirps, nx, dx, backend) + subroutine allocate_tdsops(dirps, nx, dx, backend) class(dirps_t), intent(inout) :: dirps real(dp), intent(in) :: dx integer, intent(in) :: nx class(base_backend_t), intent(in) :: backend call backend%alloc_tdsops(dirps%der1st, nx, dx, & - 'first-deriv', 'compact6') + 'first-deriv', 'compact6') call backend%alloc_tdsops(dirps%der1st_sym, nx, dx, & - 'first-deriv', 'compact6') + 'first-deriv', 'compact6') call backend%alloc_tdsops(dirps%der2nd, nx, dx, & - 'second-deriv', 'compact6') + 'second-deriv', 'compact6') call backend%alloc_tdsops(dirps%der2nd_sym, nx, dx, & - 'second-deriv', 'compact6') + 'second-deriv', 'compact6') call backend%alloc_tdsops(dirps%interpl_v2p, nx, dx, & - 'interpolate', 'classic', from_to='v2p') + 'interpolate', 'classic', from_to='v2p') call backend%alloc_tdsops(dirps%interpl_p2v, nx, dx, & - 'interpolate', 'classic', from_to='p2v') + 'interpolate', 'classic', from_to='p2v') call backend%alloc_tdsops(dirps%stagder_v2p, nx, dx, & - 'stag-deriv', 'compact6', from_to='v2p') + 'stag-deriv', 'compact6', from_to='v2p') call backend%alloc_tdsops(dirps%stagder_p2v, nx, dx, & - 'stag-deriv', 'compact6', from_to='p2v') + 'stag-deriv', 'compact6', from_to='p2v') - end subroutine + end subroutine - subroutine transeq(self, du, dv, dw, u, v, w) + subroutine transeq(self, du, dv, dw, u, v, w) !! Skew-symmetric form of convection-diffusion terms in the !! incompressible Navier-Stokes momemtum equations, excluding !! pressure terms. @@ -189,7 +189,7 @@ subroutine transeq(self, du, dv, dw, u, v, w) class(field_t), intent(in) :: u, v, w class(field_t), pointer :: u_y, v_y, w_y, u_z, v_z, w_z, & - du_y, dv_y, dw_y, du_z, dv_z, dw_z + du_y, dv_y, dw_y, du_z, dv_z, dw_z ! -1/2(nabla u curl u + u nabla u) + nu nablasq u @@ -260,9 +260,9 @@ subroutine transeq(self, du, dv, dw, u, v, w) call self%backend%allocator%release_block(dv_z) call self%backend%allocator%release_block(dw_z) - end subroutine transeq + end subroutine transeq - subroutine divergence_v2p(self, div_u, u, v, w) + subroutine divergence_v2p(self, div_u, u, v, w) !! Divergence of a vector field (u, v, w). !! Inputs from velocity grid and outputs to pressure grid. implicit none @@ -272,8 +272,8 @@ subroutine divergence_v2p(self, div_u, u, v, w) class(field_t), intent(in) :: u, v, w class(field_t), pointer :: du_x, dv_x, dw_x, & - u_y, v_y, w_y, du_y, dv_y, dw_y, & - u_z, w_z, dw_z + u_y, v_y, w_y, du_y, dv_y, dw_y, & + u_z, w_z, dw_z du_x => self%backend%allocator%get_block(DIR_X) dv_x => self%backend%allocator%get_block(DIR_X) @@ -283,11 +283,11 @@ subroutine divergence_v2p(self, div_u, u, v, w) ! Interpolation for v field in x ! Interpolation for w field in x call self%backend%tds_solve(du_x, u, self%xdirps, & - self%xdirps%stagder_v2p) + self%xdirps%stagder_v2p) call self%backend%tds_solve(dv_x, v, self%xdirps, & - self%xdirps%interpl_v2p) + self%xdirps%interpl_v2p) call self%backend%tds_solve(dw_x, w, self%xdirps, & - self%xdirps%interpl_v2p) + self%xdirps%interpl_v2p) ! request fields from the allocator u_y => self%backend%allocator%get_block(DIR_Y) @@ -309,11 +309,11 @@ subroutine divergence_v2p(self, div_u, u, v, w) ! similar to the x direction, obtain derivatives in y. call self%backend%tds_solve(du_y, u_y, self%ydirps, & - self%ydirps%interpl_v2p) + self%ydirps%interpl_v2p) call self%backend%tds_solve(dv_y, v_y, self%ydirps, & - self%ydirps%stagder_v2p) + self%ydirps%stagder_v2p) call self%backend%tds_solve(dw_y, w_y, self%ydirps, & - self%ydirps%interpl_v2p) + self%ydirps%interpl_v2p) ! we don't need the velocities in y orientation any more, so release ! them to open up space. @@ -343,9 +343,9 @@ subroutine divergence_v2p(self, div_u, u, v, w) ! get the derivatives in z call self%backend%tds_solve(div_u, u_z, self%zdirps, & - self%zdirps%interpl_v2p) + self%zdirps%interpl_v2p) call self%backend%tds_solve(dw_z, w_z, self%zdirps, & - self%zdirps%stagder_v2p) + self%zdirps%stagder_v2p) ! div_u = div_u + dw_z call self%backend%vecadd(1._dp, dw_z, 1._dp, div_u) @@ -357,9 +357,9 @@ subroutine divergence_v2p(self, div_u, u, v, w) call self%backend%allocator%release_block(w_z) call self%backend%allocator%release_block(dw_z) - end subroutine divergence_v2p + end subroutine divergence_v2p - subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) + subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) !! Gradient of a scalar field 'pressure'. !! Inputs from pressure grid and outputs to velocity grid. implicit none @@ -369,9 +369,9 @@ subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) class(field_t), intent(in) :: pressure class(field_t), pointer :: p_sxy_z, dpdz_sxy_z, & - p_sxy_y, dpdz_sxy_y, & - p_sx_y, dpdy_sx_y, dpdz_sx_y, & - p_sx_x, dpdy_sx_x, dpdz_sx_x + p_sxy_y, dpdz_sxy_y, & + p_sx_y, dpdy_sx_y, dpdz_sx_y, & + p_sx_x, dpdy_sx_x, dpdz_sx_x p_sxy_z => self%backend%allocator%get_block(DIR_Z) dpdz_sxy_z => self%backend%allocator%get_block(DIR_Z) @@ -379,9 +379,9 @@ subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) ! Staggared der for pressure field in z ! Interpolation for pressure field in z call self%backend%tds_solve(p_sxy_z, pressure, self%zdirps, & - self%zdirps%interpl_p2v) + self%zdirps%interpl_p2v) call self%backend%tds_solve(dpdz_sxy_z, pressure, self%zdirps, & - self%zdirps%stagder_p2v) + self%zdirps%stagder_p2v) ! request fields from the allocator p_sxy_y => self%backend%allocator%get_block(DIR_Y) @@ -400,11 +400,11 @@ subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) ! similar to the z direction, obtain derivatives in y. call self%backend%tds_solve(p_sx_y, p_sxy_y, self%ydirps, & - self%ydirps%interpl_p2v) + self%ydirps%interpl_p2v) call self%backend%tds_solve(dpdy_sx_y, p_sxy_y, self%ydirps, & - self%ydirps%stagder_p2v) + self%ydirps%stagder_p2v) call self%backend%tds_solve(dpdz_sx_y, dpdz_sxy_y, self%ydirps, & - self%ydirps%interpl_p2v) + self%ydirps%interpl_p2v) ! release memory call self%backend%allocator%release_block(p_sxy_y) @@ -427,20 +427,20 @@ subroutine gradient_p2v(self, dpdx, dpdy, dpdz, pressure) ! get the derivatives in x call self%backend%tds_solve(dpdx, p_sx_x, self%xdirps, & - self%xdirps%stagder_p2v) + self%xdirps%stagder_p2v) call self%backend%tds_solve(dpdy, dpdy_sx_x, self%xdirps, & - self%xdirps%interpl_p2v) + self%xdirps%interpl_p2v) call self%backend%tds_solve(dpdz, dpdz_sx_x, self%xdirps, & - self%xdirps%interpl_p2v) + self%xdirps%interpl_p2v) ! release temporary x fields call self%backend%allocator%release_block(p_sx_x) call self%backend%allocator%release_block(dpdy_sx_x) call self%backend%allocator%release_block(dpdz_sx_x) - end subroutine gradient_p2v + end subroutine gradient_p2v - subroutine curl(self, o_i_hat, o_j_hat, o_k_hat, u, v, w) + subroutine curl(self, o_i_hat, o_j_hat, o_k_hat, u, v, w) !! Curl of a vector field (u, v, w). !! Inputs from velocity grid and outputs to velocity grid. implicit none @@ -451,7 +451,7 @@ subroutine curl(self, o_i_hat, o_j_hat, o_k_hat, u, v, w) class(field_t), intent(in) :: u, v, w class(field_t), pointer :: u_y, u_z, v_z, w_y, dwdy_y, dvdz_z, dvdz_x, & - dudz_z, dudz_x, dudy_y, dudy_x + dudz_z, dudz_x, dudy_y, dudy_x ! omega_i_hat = dw/dy - dv/dz ! omega_j_hat = du/dz - dw/dx @@ -528,9 +528,9 @@ subroutine curl(self, o_i_hat, o_j_hat, o_k_hat, u, v, w) call self%backend%allocator%release_block(dudy_x) - end subroutine curl + end subroutine curl - subroutine poisson_fft(self, pressure, div_u) + subroutine poisson_fft(self, pressure, div_u) implicit none class(solver_t) :: self @@ -547,18 +547,18 @@ subroutine poisson_fft(self, pressure, div_u) ! call backward FFT call self%backend%poisson_fft%fft_backward(pressure) - end subroutine poisson_fft + end subroutine poisson_fft - subroutine poisson_cg(self, pressure, div_u) + subroutine poisson_cg(self, pressure, div_u) implicit none class(solver_t) :: self class(field_t), intent(inout) :: pressure class(field_t), intent(in) :: div_u - end subroutine poisson_cg + end subroutine poisson_cg - subroutine output(self, t, u_out) + subroutine output(self, t, u_out) implicit none class(solver_t), intent(in) :: self @@ -577,10 +577,10 @@ subroutine output(self, t, u_out) call self%curl(du, dv, dw, self%u, self%v, self%w) print*, 'enstrophy:', 0.5_dp*( & - self%backend%scalar_product(du, du) & - + self%backend%scalar_product(dv, dv) & - + self%backend%scalar_product(dw, dw) & - )/ngrid + self%backend%scalar_product(du, du) & + + self%backend%scalar_product(dv, dv) & + + self%backend%scalar_product(dw, dw) & + )/ngrid call self%backend%allocator%release_block(du) call self%backend%allocator%release_block(dv) @@ -595,9 +595,9 @@ subroutine output(self, t, u_out) print*, 'div u max mean:', maxval(abs(u_out)), sum(abs(u_out))/ngrid - end subroutine output + end subroutine output - subroutine run(self, u_out, v_out, w_out) + subroutine run(self, u_out, v_out, w_out) implicit none class(solver_t), intent(in) :: self @@ -615,52 +615,52 @@ subroutine run(self, u_out, v_out, w_out) print*, 'start run' do i = 1, self%n_iters - du => self%backend%allocator%get_block(DIR_X) - dv => self%backend%allocator%get_block(DIR_X) - dw => self%backend%allocator%get_block(DIR_X) + du => self%backend%allocator%get_block(DIR_X) + dv => self%backend%allocator%get_block(DIR_X) + dw => self%backend%allocator%get_block(DIR_X) - call self%transeq(du, dv, dw, self%u, self%v, self%w) + call self%transeq(du, dv, dw, self%u, self%v, self%w) - ! time integration - call self%time_integrator%step(self%u, self%v, self%w, & - du, dv, dw, self%dt) + ! time integration + call self%time_integrator%step(self%u, self%v, self%w, & + du, dv, dw, self%dt) - call self%backend%allocator%release_block(du) - call self%backend%allocator%release_block(dv) - call self%backend%allocator%release_block(dw) + call self%backend%allocator%release_block(du) + call self%backend%allocator%release_block(dv) + call self%backend%allocator%release_block(dw) - ! pressure - div_u => self%backend%allocator%get_block(DIR_Z) + ! pressure + div_u => self%backend%allocator%get_block(DIR_Z) - call self%divergence_v2p(div_u, self%u, self%v, self%w) + call self%divergence_v2p(div_u, self%u, self%v, self%w) - pressure => self%backend%allocator%get_block(DIR_Z) + pressure => self%backend%allocator%get_block(DIR_Z) - call self%poisson(pressure, div_u) + call self%poisson(pressure, div_u) - call self%backend%allocator%release_block(div_u) + call self%backend%allocator%release_block(div_u) - dpdx => self%backend%allocator%get_block(DIR_X) - dpdy => self%backend%allocator%get_block(DIR_X) - dpdz => self%backend%allocator%get_block(DIR_X) + dpdx => self%backend%allocator%get_block(DIR_X) + dpdy => self%backend%allocator%get_block(DIR_X) + dpdz => self%backend%allocator%get_block(DIR_X) - call self%gradient_p2v(dpdx, dpdy, dpdz, pressure) + call self%gradient_p2v(dpdx, dpdy, dpdz, pressure) - call self%backend%allocator%release_block(pressure) + call self%backend%allocator%release_block(pressure) - ! velocity correction - call self%backend%vecadd(-1._dp, dpdx, 1._dp, self%u) - call self%backend%vecadd(-1._dp, dpdy, 1._dp, self%v) - call self%backend%vecadd(-1._dp, dpdz, 1._dp, self%w) + ! velocity correction + call self%backend%vecadd(-1._dp, dpdx, 1._dp, self%u) + call self%backend%vecadd(-1._dp, dpdy, 1._dp, self%v) + call self%backend%vecadd(-1._dp, dpdz, 1._dp, self%w) - call self%backend%allocator%release_block(dpdx) - call self%backend%allocator%release_block(dpdy) - call self%backend%allocator%release_block(dpdz) + call self%backend%allocator%release_block(dpdx) + call self%backend%allocator%release_block(dpdy) + call self%backend%allocator%release_block(dpdz) - if ( mod(i, self%n_output) == 0 ) then - t = i*self%dt - call self%output(t, u_out) - end if + if ( mod(i, self%n_output) == 0 ) then + t = i*self%dt + call self%output(t, u_out) + end if end do print*, 'run end' @@ -669,6 +669,6 @@ subroutine run(self, u_out, v_out, w_out) call self%backend%get_field_data(v_out, self%v) call self%backend%get_field_data(w_out, self%w) - end subroutine run + end subroutine run -end module m_solver + end module m_solver diff --git a/src/tdsops.f90 b/src/tdsops.f90 index f702a2a5..20df3b7c 100644 --- a/src/tdsops.f90 +++ b/src/tdsops.f90 @@ -1,11 +1,11 @@ -module m_tdsops - use iso_fortran_env, only: stderr => error_unit + module m_tdsops + use iso_fortran_env, only: stderr => error_unit - use m_common, only: dp, pi + use m_common, only: dp, pi - implicit none + implicit none - type :: tdsops_t + type :: tdsops_t !! Tridiagonal Solver Operators class. !! !! Operator arrays are preprocessed in this class based on the arguments @@ -24,36 +24,36 @@ module m_tdsops !! location among other ranks. All the operator arrays here are used when !! executing a distributed tridiagonal solver phase one or two. real(dp), allocatable, dimension(:) :: dist_fw, dist_bw, & !! fw/bw phase - dist_sa, dist_sc, & !! back subs. - dist_af !! the auxiliary factors + dist_sa, dist_sc, & !! back subs. + dist_af !! the auxiliary factors real(dp), allocatable :: coeffs(:), coeffs_s(:, :), coeffs_e(:, :) real(dp) :: alpha, a, b, c = 0._dp, d = 0._dp integer :: n, n_halo - contains + contains procedure :: deriv_1st, deriv_2nd, interpl_mid, stagder_1st, preprocess - end type tdsops_t + end type tdsops_t - interface tdsops_t + interface tdsops_t module procedure tdsops_init - end interface tdsops_t + end interface tdsops_t - type :: dirps_t + type :: dirps_t !! Directional tridiagonal solver container. !! !! This class contains the preprocessed tridiagonal solvers for operating !! in each coordinate direction. class(tdsops_t), allocatable :: der1st, der1st_sym, & - der2nd, der2nd_sym, & - stagder_v2p, stagder_p2v, & - interpl_v2p, interpl_p2v + der2nd, der2nd_sym, & + stagder_v2p, stagder_p2v, & + interpl_v2p, interpl_p2v integer :: nrank, nproc, pnext, pprev, n, n_blocks, dir real(dp) :: L, d - end type dirps_t + end type dirps_t -contains + contains - function tdsops_init(n, delta, operation, scheme, n_halo, from_to, & - bc_start, bc_end, sym, c_nu, nu0_nu) result(tdsops) + function tdsops_init(n, delta, operation, scheme, n_halo, from_to, & + bc_start, bc_end, sym, c_nu, nu0_nu) result(tdsops) !! Constructor function for the tdsops_t class. !! !! 'n', 'delta', 'operation', and 'scheme' are necessary arguments. @@ -88,15 +88,15 @@ function tdsops_init(n, delta, operation, scheme, n_halo, from_to, & tdsops%n = n if (present(n_halo)) then - tdsops%n_halo = n_halo - if (n_halo /= 4) then - write (stderr, '("Warning: n_halo is set to ", i2, "be careful! & - &The default is 4 and there are quite a few & - &places where things are hardcoded assuming & - &n_halo is 4.")') n_halo - end if + tdsops%n_halo = n_halo + if (n_halo /= 4) then + write (stderr, '("Warning: n_halo is set to ", i2, "be careful! & + &The default is 4 and there are quite a few & + &places where things are hardcoded assuming & + &n_halo is 4.")') n_halo + end if else - tdsops%n_halo = 4 + tdsops%n_halo = 4 end if n_stencil = 2*tdsops%n_halo + 1 @@ -109,21 +109,21 @@ function tdsops_init(n, delta, operation, scheme, n_halo, from_to, & allocate(tdsops%coeffs_e(n_stencil, tdsops%n_halo)) if (operation == 'first-deriv') then - call tdsops%deriv_1st(delta, scheme, bc_start, bc_end, sym) + call tdsops%deriv_1st(delta, scheme, bc_start, bc_end, sym) else if (operation == 'second-deriv') then - call tdsops%deriv_2nd(delta, scheme, bc_start, bc_end, sym, & - c_nu, nu0_nu) + call tdsops%deriv_2nd(delta, scheme, bc_start, bc_end, sym, & + c_nu, nu0_nu) else if (operation == 'interpolate') then - call tdsops%interpl_mid(scheme, from_to, bc_start, bc_end, sym) + call tdsops%interpl_mid(scheme, from_to, bc_start, bc_end, sym) else if (operation == 'stag-deriv') then - call tdsops%stagder_1st(delta, scheme, from_to, bc_start, bc_end, sym) + call tdsops%stagder_1st(delta, scheme, from_to, bc_start, bc_end, sym) else - error stop 'operation is not defined' + error stop 'operation is not defined' end if - end function tdsops_init + end function tdsops_init - subroutine deriv_1st(self, delta, scheme, bc_start, bc_end, sym) + subroutine deriv_1st(self, delta, scheme, bc_start, bc_end, sym) implicit none class(tdsops_t), intent(inout) :: self @@ -140,32 +140,32 @@ subroutine deriv_1st(self, delta, scheme, bc_start, bc_end, sym) if (self%n_halo < 2) error stop 'First derivative require n_halo >= 2' if (present(sym)) then - symmetry = sym + symmetry = sym else - symmetry = .false. + symmetry = .false. end if ! alpha is alfa select case (scheme) - case ('compact6') - alpha = 1._dp/3._dp - afi = 7._dp/9._dp/delta - bfi = 1._dp/36._dp/delta - case default - error stop 'scheme is not defined' + case ('compact6') + alpha = 1._dp/3._dp + afi = 7._dp/9._dp/delta + bfi = 1._dp/36._dp/delta + case default + error stop 'scheme is not defined' end select self%alpha = alpha self%a = afi; self%b = bfi self%coeffs(:) = [0._dp, 0._dp, -bfi, -afi, & - 0._dp, & - afi, bfi, 0._dp, 0._dp] + 0._dp, & + afi, bfi, 0._dp, 0._dp] do i = 1, self%n_halo - self%coeffs_s(:, i) = self%coeffs(:) - self%coeffs_e(:, i) = self%coeffs(:) + self%coeffs_s(:, i) = self%coeffs(:) + self%coeffs_e(:, i) = self%coeffs(:) end do self%dist_sa(:) = alpha; self%dist_sc(:) = alpha @@ -177,99 +177,99 @@ subroutine deriv_1st(self, delta, scheme, bc_start, bc_end, sym) dist_b(:) = 1._dp select case (bc_start) - case ('neumann') - if (symmetry) then - ! sym == .true.; d(uu)/dx, dv/dx, dw/dx - ! d(vv)/dy, du/dy, dw/dy - ! d(ww)/dz, du/dz, dv/dz - self%dist_sa(1) = 0._dp - self%dist_sc(1) = 0._dp - self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & - 0._dp, & - 0._dp, 0._dp, 0._dp, 0._dp] - self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, -afi, & - -bfi, & - afi, bfi, 0._dp, 0._dp] - else - ! sym == .false.; d(uv)/dx, d(uw)/dx, du/dx - ! d(vu)/dy, d(vw)/dy, dv/dy - ! d(wu)/dz, d(wv)/dz, dw/dz - self%dist_sa(1) = 0._dp - self%dist_sc(1) = 2*alpha - self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & - 0._dp, & - 2*afi, 2*bfi, 0._dp, 0._dp] - self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, -afi, & - bfi, & - afi, bfi, 0._dp, 0._dp] - end if - case ('dirichlet') - ! first line - self%dist_sa(1) = 0._dp - self%dist_sc(1) = 2._dp - self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & - -2.5_dp, & - 2._dp, 0.5_dp, 0._dp, 0._dp] - self%coeffs_s(:, 1) = self%coeffs_s(:, 1)/delta - ! second line - self%dist_sa(2) = 0.25_dp - self%dist_sc(2) = 0.25_dp - self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, -0.75_dp, & - 0._dp, & - 0.75_dp, 0._dp, 0._dp, 0._dp] - self%coeffs_s(:, 2) = self%coeffs_s(:, 2)/delta + case ('neumann') + if (symmetry) then + ! sym == .true.; d(uu)/dx, dv/dx, dw/dx + ! d(vv)/dy, du/dy, dw/dy + ! d(ww)/dz, du/dz, dv/dz + self%dist_sa(1) = 0._dp + self%dist_sc(1) = 0._dp + self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & + 0._dp, & + 0._dp, 0._dp, 0._dp, 0._dp] + self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, -afi, & + -bfi, & + afi, bfi, 0._dp, 0._dp] + else + ! sym == .false.; d(uv)/dx, d(uw)/dx, du/dx + ! d(vu)/dy, d(vw)/dy, dv/dy + ! d(wu)/dz, d(wv)/dz, dw/dz + self%dist_sa(1) = 0._dp + self%dist_sc(1) = 2*alpha + self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & + 0._dp, & + 2*afi, 2*bfi, 0._dp, 0._dp] + self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, -afi, & + bfi, & + afi, bfi, 0._dp, 0._dp] + end if + case ('dirichlet') + ! first line + self%dist_sa(1) = 0._dp + self%dist_sc(1) = 2._dp + self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & + -2.5_dp, & + 2._dp, 0.5_dp, 0._dp, 0._dp] + self%coeffs_s(:, 1) = self%coeffs_s(:, 1)/delta + ! second line + self%dist_sa(2) = 0.25_dp + self%dist_sc(2) = 0.25_dp + self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, -0.75_dp, & + 0._dp, & + 0.75_dp, 0._dp, 0._dp, 0._dp] + self%coeffs_s(:, 2) = self%coeffs_s(:, 2)/delta end select select case (bc_end) - case ('neumann') - if (symmetry) then - ! sym == .true.; d(uu)/dx, dv/dx, dw/dx - ! d(vv)/dy, du/dy, dw/dy - ! d(ww)/dz, du/dz, dv/dz - self%dist_sa(n) = 0._dp - self%dist_sc(n) = 0._dp - self%coeffs_e(:, n_halo) = [0._dp, 0._dp, 0._dp, 0._dp, & - 0._dp, & - 0._dp, 0._dp, 0._dp, 0._dp] - self%coeffs_e(:, n_halo - 1) = [0._dp, 0._dp, -bfi, -afi, & - bfi, & - afi, 0._dp, 0._dp, 0._dp] - else - ! sym == .false.; d(uv)/dx, d(uw)/dx, du/dx - ! d(vu)/dy, d(vw)/dy, dv/dy - ! d(wu)/dz, d(wv)/dz, dw/dz - self%dist_sa(n) = 2*alpha - self%dist_sc(n) = 0._dp - self%coeffs_e(:, n_halo) = [0._dp, 0._dp, -2*bfi, -2*afi, & - 0._dp, & - 0._dp, 0._dp, 0._dp, 0._dp] - self%coeffs_e(:, n_halo - 1) = [0._dp, 0._dp, -bfi, -afi, & - -bfi, & - afi, 0._dp, 0._dp, 0._dp] - end if - case ('dirichlet') - ! last line - self%dist_sa(n) = 2._dp - self%dist_sc(n) = 0._dp - self%coeffs_e(:, n_halo) = [0._dp, 0._dp, -0.5_dp, -2._dp, & - 2.5_dp, & - 0._dp, 0._dp, 0._dp, 0._dp] - self%coeffs_e(:, n_halo) = self%coeffs_e(:, n_halo)/delta - ! second last line - self%dist_sa(n - 1) = 0.25_dp - self%dist_sc(n - 1) = 0.25_dp - self%coeffs_e(:, n_halo - 1) = [0._dp, 0._dp, 0._dp, -0.75_dp, & - 0._dp, & - 0.75_dp, 0._dp, 0._dp, 0._dp] - self%coeffs_e(:, n_halo - 1) = self%coeffs_e(:, n_halo - 1)/delta + case ('neumann') + if (symmetry) then + ! sym == .true.; d(uu)/dx, dv/dx, dw/dx + ! d(vv)/dy, du/dy, dw/dy + ! d(ww)/dz, du/dz, dv/dz + self%dist_sa(n) = 0._dp + self%dist_sc(n) = 0._dp + self%coeffs_e(:, n_halo) = [0._dp, 0._dp, 0._dp, 0._dp, & + 0._dp, & + 0._dp, 0._dp, 0._dp, 0._dp] + self%coeffs_e(:, n_halo - 1) = [0._dp, 0._dp, -bfi, -afi, & + bfi, & + afi, 0._dp, 0._dp, 0._dp] + else + ! sym == .false.; d(uv)/dx, d(uw)/dx, du/dx + ! d(vu)/dy, d(vw)/dy, dv/dy + ! d(wu)/dz, d(wv)/dz, dw/dz + self%dist_sa(n) = 2*alpha + self%dist_sc(n) = 0._dp + self%coeffs_e(:, n_halo) = [0._dp, 0._dp, -2*bfi, -2*afi, & + 0._dp, & + 0._dp, 0._dp, 0._dp, 0._dp] + self%coeffs_e(:, n_halo - 1) = [0._dp, 0._dp, -bfi, -afi, & + -bfi, & + afi, 0._dp, 0._dp, 0._dp] + end if + case ('dirichlet') + ! last line + self%dist_sa(n) = 2._dp + self%dist_sc(n) = 0._dp + self%coeffs_e(:, n_halo) = [0._dp, 0._dp, -0.5_dp, -2._dp, & + 2.5_dp, & + 0._dp, 0._dp, 0._dp, 0._dp] + self%coeffs_e(:, n_halo) = self%coeffs_e(:, n_halo)/delta + ! second last line + self%dist_sa(n - 1) = 0.25_dp + self%dist_sc(n - 1) = 0.25_dp + self%coeffs_e(:, n_halo - 1) = [0._dp, 0._dp, 0._dp, -0.75_dp, & + 0._dp, & + 0.75_dp, 0._dp, 0._dp, 0._dp] + self%coeffs_e(:, n_halo - 1) = self%coeffs_e(:, n_halo - 1)/delta end select call self%preprocess(dist_b) - end subroutine deriv_1st + end subroutine deriv_1st - subroutine deriv_2nd(self, delta, scheme, bc_start, bc_end, sym, & - c_nu, nu0_nu) + subroutine deriv_2nd(self, delta, scheme, bc_start, bc_end, sym, & + c_nu, nu0_nu) implicit none class(tdsops_t), intent(inout) :: self @@ -288,9 +288,9 @@ subroutine deriv_2nd(self, delta, scheme, bc_start, bc_end, sym, & if (self%n_halo < 4) error stop 'Second derivative require n_halo >= 4' if (present(sym)) then - symmetry = sym + symmetry = sym else - symmetry = .false. + symmetry = .false. end if d2 = delta*delta @@ -298,44 +298,44 @@ subroutine deriv_2nd(self, delta, scheme, bc_start, bc_end, sym, & ! alpha is alsa select case (scheme) - case ('compact6') - alpha = 2._dp/11._dp - asi = 12._dp/11._dp/d2 - bsi = 3._dp/44._dp/d2 - csi = 0._dp - dsi = 0._dp - case ('compact6-hyperviscous') - if (present(c_nu) .and. present(nu0_nu)) then - dpis3 = 2._dp*pi/3._dp - xnpi2 = pi*pi*(1._dp + nu0_nu) - xmpi2 = dpis3*dpis3*(1._dp + c_nu*nu0_nu) - den = 405._dp*xnpi2 - 640._dp*xmpi2 + 144._dp - alpha = 0.5_dp - (320._dp*xmpi2 - 1296._dp)/den - asi = -(4329._dp*xnpi2/8._dp - 32._dp*xmpi2 & - - 140._dp*xnpi2*xmpi2 + 286._dp)/den/d2 - bsi = (2115._dp*xnpi2 - 1792._dp*xmpi2 & - - 280._dp*xnpi2*xmpi2 + 1328._dp)/den/(4._dp*d2) - csi = -(7695._dp*xnpi2/8._dp + 288._dp*xmpi2 & - - 180._dp*xnpi2*xmpi2 - 2574._dp)/den/(9._dp*d2) - dsi = (198._dp*xnpi2 + 128._dp*xmpi2 & - - 40._dp*xnpi2*xmpi2 - 736._dp)/den/(16._dp*d2) - else - error stop 'compact6-hyperviscous requires c_nu and nu0_nu' - end if - case default - error stop 'scheme is not defined' + case ('compact6') + alpha = 2._dp/11._dp + asi = 12._dp/11._dp/d2 + bsi = 3._dp/44._dp/d2 + csi = 0._dp + dsi = 0._dp + case ('compact6-hyperviscous') + if (present(c_nu) .and. present(nu0_nu)) then + dpis3 = 2._dp*pi/3._dp + xnpi2 = pi*pi*(1._dp + nu0_nu) + xmpi2 = dpis3*dpis3*(1._dp + c_nu*nu0_nu) + den = 405._dp*xnpi2 - 640._dp*xmpi2 + 144._dp + alpha = 0.5_dp - (320._dp*xmpi2 - 1296._dp)/den + asi = -(4329._dp*xnpi2/8._dp - 32._dp*xmpi2 & + - 140._dp*xnpi2*xmpi2 + 286._dp)/den/d2 + bsi = (2115._dp*xnpi2 - 1792._dp*xmpi2 & + - 280._dp*xnpi2*xmpi2 + 1328._dp)/den/(4._dp*d2) + csi = -(7695._dp*xnpi2/8._dp + 288._dp*xmpi2 & + - 180._dp*xnpi2*xmpi2 - 2574._dp)/den/(9._dp*d2) + dsi = (198._dp*xnpi2 + 128._dp*xmpi2 & + - 40._dp*xnpi2*xmpi2 - 736._dp)/den/(16._dp*d2) + else + error stop 'compact6-hyperviscous requires c_nu and nu0_nu' + end if + case default + error stop 'scheme is not defined' end select self%alpha = alpha self%a = asi; self%b = bsi; self%c = csi; self%d = dsi self%coeffs(:) = [dsi, csi, bsi, asi, & - -2._dp*(asi + bsi + csi + dsi), & - asi, bsi, csi, dsi] + -2._dp*(asi + bsi + csi + dsi), & + asi, bsi, csi, dsi] do i = 1, self%n_halo - self%coeffs_s(:, i) = self%coeffs(:) - self%coeffs_e(:, i) = self%coeffs(:) + self%coeffs_s(:, i) = self%coeffs(:) + self%coeffs_e(:, i) = self%coeffs(:) end do self%dist_sa(:) = alpha; self%dist_sc(:) = alpha @@ -347,140 +347,140 @@ subroutine deriv_2nd(self, delta, scheme, bc_start, bc_end, sym, & dist_b(:) = 1._dp select case (bc_start) - case ('neumann') - if (symmetry) then - ! sym == .true.; d2v/dx2, d2w/dx2 - ! d2u/dy2, d2w/dy2 - ! d2u/dz2, d2v/dz2 - self%dist_sa(1) = 0._dp - self%dist_sc(1) = 2*alpha - self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & - -2*asi - 2*bsi - 2*csi - 2*dsi, & - 2*asi, 2*bsi, 2*csi, 2*dsi] - self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, asi, & - -2*asi - bsi - 2*csi - 2*dsi, & - asi + csi, bsi + dsi, csi, dsi] - self%coeffs_s(:, 3) = [0._dp, 0._dp, bsi, asi + csi, & - -2*asi - 2*bsi - 2*csi - dsi, & - asi, bsi, csi, dsi] - self%coeffs_s(:, 4) = [0._dp, csi, bsi + dsi, asi, & - -2*asi - 2*bsi - 2*csi - 2*dsi, & - asi, bsi, csi, dsi] - else - ! sym == .false.; d2u/dx2 - ! d2v/dy2 - ! d2w/dz2 - self%dist_sa(1) = 0._dp - self%dist_sc(1) = 0._dp - self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & - 0._dp, & - 0._dp, 0._dp, 0._dp, 0._dp] - self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, asi, & - -2*asi - 3*bsi - 2*csi - 2*dsi, & - asi - csi, bsi - dsi, csi, dsi] - self%coeffs_s(:, 3) = [0._dp, 0._dp, bsi, asi - csi, & - -2*asi - 2*bsi - 2*csi - 3*dsi, & - asi, bsi, csi, dsi] - self%coeffs_s(:, 4) = [0._dp, -csi, bsi - dsi, asi, & - -2*asi - 2*bsi - 2*csi - 2*dsi, & - asi, bsi, csi, dsi] - end if - case ('dirichlet') - ! first line - self%dist_sa(1) = 0._dp - self%dist_sc(1) = 11._dp - self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & - 13._dp/d2, & - -27._dp/d2, 15._dp/d2, -1._dp/d2, 0._dp] - ! second line - self%dist_sa(2) = 0.1_dp - self%dist_sc(2) = 0.1_dp - self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, 1.2_dp/d2, & - -2.4_dp/d2, & - 1.2_dp/d2, 0._dp, 0._dp, 0._dp] - ! third line - self%dist_sa(3) = 2._dp/11._dp - self%dist_sc(3) = 2._dp/11._dp - temp1 = 3._dp/44._dp/d2; temp2 = 12._dp/11._dp/d2 - self%coeffs_s(:, 3) = [0._dp, 0._dp, temp1, temp2, & - -2._dp*(temp1 + temp2), & - temp2, temp1, 0._dp, 0._dp] - ! fourth line is same as third - self%dist_sa(4) = 2._dp/11._dp - self%dist_sc(4) = 2._dp/11._dp - self%coeffs_s(:, 4) = self%coeffs_s(:, 3) + case ('neumann') + if (symmetry) then + ! sym == .true.; d2v/dx2, d2w/dx2 + ! d2u/dy2, d2w/dy2 + ! d2u/dz2, d2v/dz2 + self%dist_sa(1) = 0._dp + self%dist_sc(1) = 2*alpha + self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & + -2*asi - 2*bsi - 2*csi - 2*dsi, & + 2*asi, 2*bsi, 2*csi, 2*dsi] + self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, asi, & + -2*asi - bsi - 2*csi - 2*dsi, & + asi + csi, bsi + dsi, csi, dsi] + self%coeffs_s(:, 3) = [0._dp, 0._dp, bsi, asi + csi, & + -2*asi - 2*bsi - 2*csi - dsi, & + asi, bsi, csi, dsi] + self%coeffs_s(:, 4) = [0._dp, csi, bsi + dsi, asi, & + -2*asi - 2*bsi - 2*csi - 2*dsi, & + asi, bsi, csi, dsi] + else + ! sym == .false.; d2u/dx2 + ! d2v/dy2 + ! d2w/dz2 + self%dist_sa(1) = 0._dp + self%dist_sc(1) = 0._dp + self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & + 0._dp, & + 0._dp, 0._dp, 0._dp, 0._dp] + self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, asi, & + -2*asi - 3*bsi - 2*csi - 2*dsi, & + asi - csi, bsi - dsi, csi, dsi] + self%coeffs_s(:, 3) = [0._dp, 0._dp, bsi, asi - csi, & + -2*asi - 2*bsi - 2*csi - 3*dsi, & + asi, bsi, csi, dsi] + self%coeffs_s(:, 4) = [0._dp, -csi, bsi - dsi, asi, & + -2*asi - 2*bsi - 2*csi - 2*dsi, & + asi, bsi, csi, dsi] + end if + case ('dirichlet') + ! first line + self%dist_sa(1) = 0._dp + self%dist_sc(1) = 11._dp + self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & + 13._dp/d2, & + -27._dp/d2, 15._dp/d2, -1._dp/d2, 0._dp] + ! second line + self%dist_sa(2) = 0.1_dp + self%dist_sc(2) = 0.1_dp + self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, 1.2_dp/d2, & + -2.4_dp/d2, & + 1.2_dp/d2, 0._dp, 0._dp, 0._dp] + ! third line + self%dist_sa(3) = 2._dp/11._dp + self%dist_sc(3) = 2._dp/11._dp + temp1 = 3._dp/44._dp/d2; temp2 = 12._dp/11._dp/d2 + self%coeffs_s(:, 3) = [0._dp, 0._dp, temp1, temp2, & + -2._dp*(temp1 + temp2), & + temp2, temp1, 0._dp, 0._dp] + ! fourth line is same as third + self%dist_sa(4) = 2._dp/11._dp + self%dist_sc(4) = 2._dp/11._dp + self%coeffs_s(:, 4) = self%coeffs_s(:, 3) end select select case (bc_end) - case ('neumann') - if (symmetry) then - ! sym == .true.; d2v/dx2, d2w/dx2 - ! d2u/dy2, d2w/dy2 - ! d2u/dz2, d2v/dz2 - self%dist_sa(n) = 2*alpha - self%dist_sc(n) = 0._dp - self%coeffs_e(:, 4) = [dsi, csi, bsi, asi, & - -2*asi - 2*bsi - 2*csi - 2*dsi, & - 0._dp, 0._dp, 0._dp, 0._dp] - self%coeffs_e(:, 3) = [dsi, csi, bsi + dsi, asi + csi, & - -2*asi - bsi - 2*csi - 2*dsi, & - asi, 0._dp, 0._dp, 0._dp] - self%coeffs_e(:, 2) = [dsi, csi, bsi, asi, & - -2*asi - 2*bsi - 2*csi - dsi, & - asi + csi, bsi, 0._dp, 0._dp] - self%coeffs_e(:, 1) = [dsi, csi, bsi, asi, & - -2*asi - 2*bsi - 2*csi - 2*dsi, & - asi, bsi + dsi, csi, 0._dp] - else - ! sym == .false.; d2u/dx2 - ! d2v/dy2 - ! d2w/dz2 - self%dist_sa(n) = 0._dp - self%dist_sc(n) = 0._dp - self%coeffs_e(:, 4) = [0._dp, 0._dp, 0._dp, 0._dp, & - 0._dp, & - 0._dp, 0._dp, 0._dp, 0._dp] - self%coeffs_e(:, 3) = [dsi, csi, bsi - dsi, asi - csi, & - -2*asi - 3*bsi - 2*csi - 2*dsi, & - asi, 0._dp, 0._dp, 0._dp] - self%coeffs_e(:, 2) = [dsi, csi, bsi, asi, & - -2*asi - 2*bsi - 2*csi - 3*dsi, & - asi - csi, bsi, 0._dp, 0._dp] - self%coeffs_e(:, 1) = [dsi, csi, bsi, asi, & - -2*asi - 2*bsi - 2*csi - 2*dsi, & - asi, bsi - dsi, -csi, 0._dp] - end if - case ('dirichlet') - ! last line - self%dist_sa(n) = 11._dp - self%dist_sc(n) = 0._dp - self%coeffs_e(:, 4) = [0._dp, -1._dp/d2, 15._dp/d2, -27._dp/d2, & - 13._dp/d2, & - 0._dp, 0._dp, 0._dp, 0._dp] - ! second last line - self%dist_sa(n - 1) = 0.1_dp - self%dist_sc(n - 1) = 0.1_dp - self%coeffs_e(:, 3) = [0._dp, 0._dp, 0._dp, 1.2_dp/d2, & - -2.4_dp/d2, & - 1.2_dp/d2, 0._dp, 0._dp, 0._dp] - ! third last line - self%dist_sa(n - 2) = 2._dp/11._dp - self%dist_sc(n - 2) = 2._dp/11._dp - temp1 = 3._dp/44._dp/d2; temp2 = 12._dp/11._dp/d2 - self%coeffs_e(:, 2) = [0._dp, 0._dp, temp1, temp2, & - -2._dp*(temp1 + temp2), & - temp2, temp1, 0._dp, 0._dp] - ! fourth last line is same as third last - self%dist_sa(n - 3) = 2._dp/11._dp - self%dist_sc(n - 3) = 2._dp/11._dp - self%coeffs_e(:, 1) = self%coeffs_e(:, 2) + case ('neumann') + if (symmetry) then + ! sym == .true.; d2v/dx2, d2w/dx2 + ! d2u/dy2, d2w/dy2 + ! d2u/dz2, d2v/dz2 + self%dist_sa(n) = 2*alpha + self%dist_sc(n) = 0._dp + self%coeffs_e(:, 4) = [dsi, csi, bsi, asi, & + -2*asi - 2*bsi - 2*csi - 2*dsi, & + 0._dp, 0._dp, 0._dp, 0._dp] + self%coeffs_e(:, 3) = [dsi, csi, bsi + dsi, asi + csi, & + -2*asi - bsi - 2*csi - 2*dsi, & + asi, 0._dp, 0._dp, 0._dp] + self%coeffs_e(:, 2) = [dsi, csi, bsi, asi, & + -2*asi - 2*bsi - 2*csi - dsi, & + asi + csi, bsi, 0._dp, 0._dp] + self%coeffs_e(:, 1) = [dsi, csi, bsi, asi, & + -2*asi - 2*bsi - 2*csi - 2*dsi, & + asi, bsi + dsi, csi, 0._dp] + else + ! sym == .false.; d2u/dx2 + ! d2v/dy2 + ! d2w/dz2 + self%dist_sa(n) = 0._dp + self%dist_sc(n) = 0._dp + self%coeffs_e(:, 4) = [0._dp, 0._dp, 0._dp, 0._dp, & + 0._dp, & + 0._dp, 0._dp, 0._dp, 0._dp] + self%coeffs_e(:, 3) = [dsi, csi, bsi - dsi, asi - csi, & + -2*asi - 3*bsi - 2*csi - 2*dsi, & + asi, 0._dp, 0._dp, 0._dp] + self%coeffs_e(:, 2) = [dsi, csi, bsi, asi, & + -2*asi - 2*bsi - 2*csi - 3*dsi, & + asi - csi, bsi, 0._dp, 0._dp] + self%coeffs_e(:, 1) = [dsi, csi, bsi, asi, & + -2*asi - 2*bsi - 2*csi - 2*dsi, & + asi, bsi - dsi, -csi, 0._dp] + end if + case ('dirichlet') + ! last line + self%dist_sa(n) = 11._dp + self%dist_sc(n) = 0._dp + self%coeffs_e(:, 4) = [0._dp, -1._dp/d2, 15._dp/d2, -27._dp/d2, & + 13._dp/d2, & + 0._dp, 0._dp, 0._dp, 0._dp] + ! second last line + self%dist_sa(n - 1) = 0.1_dp + self%dist_sc(n - 1) = 0.1_dp + self%coeffs_e(:, 3) = [0._dp, 0._dp, 0._dp, 1.2_dp/d2, & + -2.4_dp/d2, & + 1.2_dp/d2, 0._dp, 0._dp, 0._dp] + ! third last line + self%dist_sa(n - 2) = 2._dp/11._dp + self%dist_sc(n - 2) = 2._dp/11._dp + temp1 = 3._dp/44._dp/d2; temp2 = 12._dp/11._dp/d2 + self%coeffs_e(:, 2) = [0._dp, 0._dp, temp1, temp2, & + -2._dp*(temp1 + temp2), & + temp2, temp1, 0._dp, 0._dp] + ! fourth last line is same as third last + self%dist_sa(n - 3) = 2._dp/11._dp + self%dist_sc(n - 3) = 2._dp/11._dp + self%coeffs_e(:, 1) = self%coeffs_e(:, 2) end select call self%preprocess(dist_b) - end subroutine deriv_2nd + end subroutine deriv_2nd - subroutine interpl_mid(self, scheme, from_to, bc_start, bc_end, sym) + subroutine interpl_mid(self, scheme, from_to, bc_start, bc_end, sym) implicit none class(tdsops_t), intent(inout) :: self @@ -497,45 +497,45 @@ subroutine interpl_mid(self, scheme, from_to, bc_start, bc_end, sym) ! alpha is ailcai select case (scheme) - case ('classic') - alpha = 0.3_dp - aici = 0.75_dp - bici = 0.05_dp - cici = 0._dp - dici = 0._dp - case ('optimised') - alpha = 0.461658_dp - dici = 0.00146508_dp - aici = (75._dp + 70._dp*alpha - 640._dp*dici)/128._dp - bici = (-25._dp + 126._dp*alpha + 2304._dp*dici)/256._dp - cici = (3._dp - 10._dp*alpha - 1280._dp*dici)/256._dp - case ('aggressive') - alpha = 0.49_dp - aici = (75._dp + 70._dp*alpha)/128._dp - bici = (-25._dp + 126._dp*alpha)/256._dp - cici = (3._dp - 10._dp*alpha)/256._dp - dici = 0._dp - case default - error stop 'scheme is not defined' + case ('classic') + alpha = 0.3_dp + aici = 0.75_dp + bici = 0.05_dp + cici = 0._dp + dici = 0._dp + case ('optimised') + alpha = 0.461658_dp + dici = 0.00146508_dp + aici = (75._dp + 70._dp*alpha - 640._dp*dici)/128._dp + bici = (-25._dp + 126._dp*alpha + 2304._dp*dici)/256._dp + cici = (3._dp - 10._dp*alpha - 1280._dp*dici)/256._dp + case ('aggressive') + alpha = 0.49_dp + aici = (75._dp + 70._dp*alpha)/128._dp + bici = (-25._dp + 126._dp*alpha)/256._dp + cici = (3._dp - 10._dp*alpha)/256._dp + dici = 0._dp + case default + error stop 'scheme is not defined' end select self%alpha = alpha self%a = aici; self%b = bici; self%c = cici; self%d = dici select case (from_to) - case ('v2p') - self%coeffs(:) = [0._dp, dici, cici, bici, & - aici, & - aici, bici, cici, dici] - case ('p2v') - self%coeffs(:) = [dici, cici, bici, aici, & - aici, & - bici, cici, dici, 0._dp] + case ('v2p') + self%coeffs(:) = [0._dp, dici, cici, bici, & + aici, & + aici, bici, cici, dici] + case ('p2v') + self%coeffs(:) = [dici, cici, bici, aici, & + aici, & + bici, cici, dici, 0._dp] end select do i = 1, self%n_halo - self%coeffs_s(:, i) = self%coeffs(:) - self%coeffs_e(:, i) = self%coeffs(:) + self%coeffs_s(:, i) = self%coeffs(:) + self%coeffs_e(:, i) = self%coeffs(:) end do self%dist_sa(:) = alpha; self%dist_sc(:) = alpha @@ -547,78 +547,78 @@ subroutine interpl_mid(self, scheme, from_to, bc_start, bc_end, sym) dist_b(:) = 1._dp if ((bc_start == 'dirichlet') .or. (bc_start == 'neumann')) then - self%dist_sa(1) = 0._dp + self%dist_sa(1) = 0._dp - select case (from_to) + select case (from_to) case ('v2p') - ! sym is always .true. - dist_b(1) = 1._dp + alpha - self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & - aici, & - aici + bici, bici + cici, cici + dici, dici] - self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, bici, & - aici + cici, & - aici + dici, bici, cici, dici] - self%coeffs_s(:, 3) = [0._dp, 0._dp, cici, bici + dici, & - aici, & - aici, bici, cici, dici] + ! sym is always .true. + dist_b(1) = 1._dp + alpha + self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & + aici, & + aici + bici, bici + cici, cici + dici, dici] + self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, bici, & + aici + cici, & + aici + dici, bici, cici, dici] + self%coeffs_s(:, 3) = [0._dp, 0._dp, cici, bici + dici, & + aici, & + aici, bici, cici, dici] case ('p2v') - ! sym is always .true. - self%dist_sc(1) = 2*alpha - self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & - 2*aici, & - 2*bici, 2*cici, 2*dici, 0._dp] - self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, aici + bici, & - aici + cici, & - bici + dici, cici, dici, 0._dp] - self%coeffs_s(:, 3) = [0._dp, 0._dp, bici + cici, aici + dici, & - aici, & - bici, cici, dici, 0._dp] - self%coeffs_s(:, 4) = [0._dp, cici + dici, bici, aici, & - aici, & - bici, cici, dici, 0._dp] - end select + ! sym is always .true. + self%dist_sc(1) = 2*alpha + self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & + 2*aici, & + 2*bici, 2*cici, 2*dici, 0._dp] + self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, aici + bici, & + aici + cici, & + bici + dici, cici, dici, 0._dp] + self%coeffs_s(:, 3) = [0._dp, 0._dp, bici + cici, aici + dici, & + aici, & + bici, cici, dici, 0._dp] + self%coeffs_s(:, 4) = [0._dp, cici + dici, bici, aici, & + aici, & + bici, cici, dici, 0._dp] + end select end if if ((bc_end == 'dirichlet') .or. (bc_end == 'neumann')) then - self%dist_sc(n) = 0._dp + self%dist_sc(n) = 0._dp - select case (from_to) + select case (from_to) case ('v2p') - ! sym is always .true. - dist_b(n) = 1._dp + alpha - self%coeffs_e(:, 4) = [0._dp, dici, cici + dici, bici + cici, & - aici + bici, & - aici, 0._dp, 0._dp, 0._dp] - self%coeffs_e(:, 3) = [0._dp, dici, cici, bici, & - aici + dici, & - aici + cici, bici, 0._dp, 0._dp] - self%coeffs_e(:, 2) = [0._dp, dici, cici, bici, & - aici, & - aici, bici + dici, cici, 0._dp] + ! sym is always .true. + dist_b(n) = 1._dp + alpha + self%coeffs_e(:, 4) = [0._dp, dici, cici + dici, bici + cici, & + aici + bici, & + aici, 0._dp, 0._dp, 0._dp] + self%coeffs_e(:, 3) = [0._dp, dici, cici, bici, & + aici + dici, & + aici + cici, bici, 0._dp, 0._dp] + self%coeffs_e(:, 2) = [0._dp, dici, cici, bici, & + aici, & + aici, bici + dici, cici, 0._dp] case ('p2v') - ! sym is always .true. - self%dist_sa(n) = 2*alpha - self%coeffs_e(:, 4) = [2*dici, 2*cici, 2*bici, 2*aici, & - 0._dp, & - 0._dp, 0._dp, 0._dp, 0._dp] - self%coeffs_e(:, 3) = [dici, cici, bici + dici, aici + cici, & - aici + bici, & - 0._dp, 0._dp, 0._dp, 0._dp] - self%coeffs_e(:, 2) = [dici, cici, bici, aici, & - aici + dici, & - bici + cici, 0._dp, 0._dp, 0._dp] - self%coeffs_e(:, 1) = [dici, cici, bici, aici, & - aici, & - bici, cici + dici, 0._dp, 0._dp] - end select + ! sym is always .true. + self%dist_sa(n) = 2*alpha + self%coeffs_e(:, 4) = [2*dici, 2*cici, 2*bici, 2*aici, & + 0._dp, & + 0._dp, 0._dp, 0._dp, 0._dp] + self%coeffs_e(:, 3) = [dici, cici, bici + dici, aici + cici, & + aici + bici, & + 0._dp, 0._dp, 0._dp, 0._dp] + self%coeffs_e(:, 2) = [dici, cici, bici, aici, & + aici + dici, & + bici + cici, 0._dp, 0._dp, 0._dp] + self%coeffs_e(:, 1) = [dici, cici, bici, aici, & + aici, & + bici, cici + dici, 0._dp, 0._dp] + end select end if call self%preprocess(dist_b) - end subroutine interpl_mid + end subroutine interpl_mid - subroutine stagder_1st(self, delta, scheme, from_to, bc_start, bc_end, sym) + subroutine stagder_1st(self, delta, scheme, from_to, bc_start, bc_end, sym) implicit none class(tdsops_t), intent(inout) :: self @@ -636,31 +636,31 @@ subroutine stagder_1st(self, delta, scheme, from_to, bc_start, bc_end, sym) ! alpha is alcai select case (scheme) - case ('compact6') - alpha = 9._dp/62._dp - aci = 63._dp/62._dp/delta - bci = 17._dp/62._dp/3._dp/delta - case default - error stop 'scheme is not defined' + case ('compact6') + alpha = 9._dp/62._dp + aci = 63._dp/62._dp/delta + bci = 17._dp/62._dp/3._dp/delta + case default + error stop 'scheme is not defined' end select self%alpha = alpha self%a = aci; self%b = bci select case (from_to) - case ('v2p') - self%coeffs(:) = [0._dp, 0._dp, 0._dp, -bci, & - -aci, & - aci, bci, 0._dp, 0._dp] - case ('p2v') - self%coeffs(:) = [0._dp, 0._dp, -bci, -aci, & - aci, & - bci, 0._dp, 0._dp, 0._dp] + case ('v2p') + self%coeffs(:) = [0._dp, 0._dp, 0._dp, -bci, & + -aci, & + aci, bci, 0._dp, 0._dp] + case ('p2v') + self%coeffs(:) = [0._dp, 0._dp, -bci, -aci, & + aci, & + bci, 0._dp, 0._dp, 0._dp] end select do i = 1, self%n_halo - self%coeffs_s(:, i) = self%coeffs(:) - self%coeffs_e(:, i) = self%coeffs(:) + self%coeffs_s(:, i) = self%coeffs(:) + self%coeffs_e(:, i) = self%coeffs(:) end do self%dist_sa(:) = alpha; self%dist_sc(:) = alpha @@ -672,60 +672,60 @@ subroutine stagder_1st(self, delta, scheme, from_to, bc_start, bc_end, sym) dist_b(:) = 1._dp if ((bc_start == 'dirichlet') .or. (bc_start == 'neumann')) then - self%dist_sa(1) = 0._dp + self%dist_sa(1) = 0._dp - select case (from_to) + select case (from_to) case ('v2p') - ! sym is always .false. - dist_b(1) = 1._dp + alpha - self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & - -aci - 2*bci, & - aci + bci, bci, 0._dp, 0._dp] - self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, -bci, & - -aci, & - aci, bci, 0._dp, 0._dp] + ! sym is always .false. + dist_b(1) = 1._dp + alpha + self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & + -aci - 2*bci, & + aci + bci, bci, 0._dp, 0._dp] + self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, -bci, & + -aci, & + aci, bci, 0._dp, 0._dp] case ('p2v') - ! sym is always .true. - self%dist_sc(1) = 0._dp - self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & - 0._dp, & - 0._dp, 0._dp, 0._dp, 0._dp] - self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, -aci - bci, & - aci, & - bci, 0._dp, 0._dp, 0._dp] - end select + ! sym is always .true. + self%dist_sc(1) = 0._dp + self%coeffs_s(:, 1) = [0._dp, 0._dp, 0._dp, 0._dp, & + 0._dp, & + 0._dp, 0._dp, 0._dp, 0._dp] + self%coeffs_s(:, 2) = [0._dp, 0._dp, 0._dp, -aci - bci, & + aci, & + bci, 0._dp, 0._dp, 0._dp] + end select end if if ((bc_end == 'dirichlet') .or. (bc_end == 'neumann')) then - self%dist_sc(n) = 0._dp + self%dist_sc(n) = 0._dp - select case (from_to) + select case (from_to) case ('v2p') - ! sym is always .false. - dist_b(n) = 1._dp + alpha - self%coeffs_e(:, n_halo) = [0._dp, 0._dp, 0._dp, -bci, & - -aci - bci, & - aci + 2*bci, 0._dp, 0._dp, 0._dp] - self%coeffs_e(:, n_halo - 1) = [0._dp, 0._dp, 0._dp, -bci, & - -aci, & - aci, bci, 0._dp, 0._dp] + ! sym is always .false. + dist_b(n) = 1._dp + alpha + self%coeffs_e(:, n_halo) = [0._dp, 0._dp, 0._dp, -bci, & + -aci - bci, & + aci + 2*bci, 0._dp, 0._dp, 0._dp] + self%coeffs_e(:, n_halo - 1) = [0._dp, 0._dp, 0._dp, -bci, & + -aci, & + aci, bci, 0._dp, 0._dp] case ('p2v') - ! sym is always .true. - self%dist_sa(n) = 0._dp - self%coeffs_e(:, n_halo) = [0._dp, 0._dp, 0._dp, 0._dp, & - 0._dp, & - 0._dp, 0._dp, 0._dp, 0._dp] - self%coeffs_e(:, n_halo - 1) = [0._dp, 0._dp, -bci, -aci, & - aci + bci, & - 0._dp, 0._dp, 0._dp, 0._dp] - end select + ! sym is always .true. + self%dist_sa(n) = 0._dp + self%coeffs_e(:, n_halo) = [0._dp, 0._dp, 0._dp, 0._dp, & + 0._dp, & + 0._dp, 0._dp, 0._dp, 0._dp] + self%coeffs_e(:, n_halo - 1) = [0._dp, 0._dp, -bci, -aci, & + aci + bci, & + 0._dp, 0._dp, 0._dp, 0._dp] + end select end if call self%preprocess(dist_b) - end subroutine stagder_1st + end subroutine stagder_1st - subroutine preprocess(self, dist_b) + subroutine preprocess(self, dist_b) implicit none class(tdsops_t), intent(inout) :: self @@ -738,35 +738,35 @@ subroutine preprocess(self, dist_b) ! Algorithm 3 in page 4 ! First two lines first do i = 1, 2 - self%dist_sa(i) = self%dist_sa(i)/dist_b(i) - self%dist_sc(i) = self%dist_sc(i)/dist_b(i) - self%dist_bw(i) = self%dist_sc(i) - self%dist_af(i) = 1._dp/dist_b(i) + self%dist_sa(i) = self%dist_sa(i)/dist_b(i) + self%dist_sc(i) = self%dist_sc(i)/dist_b(i) + self%dist_bw(i) = self%dist_sc(i) + self%dist_af(i) = 1._dp/dist_b(i) end do ! Then the remaining in the forward pass do i = 3, self%n - ! Algorithm 3 in ref obtains 'r' coeffs on the fly in line 7. - ! As we have to solve many RHSs with the same tridiagonal system, - ! it is better to do a preprocessing first. - ! So lets store 'r' coeff in dist_fw array. - self%dist_fw(i) = 1._dp/(dist_b(i) & - - self%dist_sa(i)*self%dist_sc(i - 1)) - ! dist_af is 'a_i' in line 7 of Algorithm 3 in ref. - self%dist_af(i) = self%dist_sa(i) - ! We store a_i^* and c_i^* in dist_sa and dist_sc because - ! we need them later in the substitution phase. - self%dist_sa(i) = -self%dist_fw(i)*self%dist_sa(i) & - *self%dist_sa(i - 1) - self%dist_sc(i) = self%dist_fw(i)*self%dist_sc(i) + ! Algorithm 3 in ref obtains 'r' coeffs on the fly in line 7. + ! As we have to solve many RHSs with the same tridiagonal system, + ! it is better to do a preprocessing first. + ! So lets store 'r' coeff in dist_fw array. + self%dist_fw(i) = 1._dp/(dist_b(i) & + - self%dist_sa(i)*self%dist_sc(i - 1)) + ! dist_af is 'a_i' in line 7 of Algorithm 3 in ref. + self%dist_af(i) = self%dist_sa(i) + ! We store a_i^* and c_i^* in dist_sa and dist_sc because + ! we need them later in the substitution phase. + self%dist_sa(i) = -self%dist_fw(i)*self%dist_sa(i) & + *self%dist_sa(i - 1) + self%dist_sc(i) = self%dist_fw(i)*self%dist_sc(i) end do ! backward pass starting in line 12 of Algorithm 3. do i = self%n - 2, 2, -1 - self%dist_sa(i) = self%dist_sa(i) & - - self%dist_sc(i)*self%dist_sa(i + 1) - self%dist_bw(i) = self%dist_sc(i) - self%dist_sc(i) = -self%dist_sc(i)*self%dist_sc(i + 1) + self%dist_sa(i) = self%dist_sa(i) & + - self%dist_sc(i)*self%dist_sa(i + 1) + self%dist_bw(i) = self%dist_sc(i) + self%dist_sc(i) = -self%dist_sc(i)*self%dist_sc(i + 1) end do ! Line 17 and 18 are tricky @@ -782,7 +782,7 @@ subroutine preprocess(self, dist_b) self%dist_sa(1) = self%dist_fw(1)*self%dist_sa(1) self%dist_sc(1) = -self%dist_fw(1)*self%dist_sc(1)*self%dist_sc(2) - end subroutine preprocess + end subroutine preprocess -end module m_tdsops + end module m_tdsops diff --git a/src/time_integrator.f90 b/src/time_integrator.f90 index 98195d21..959d8dc1 100644 --- a/src/time_integrator.f90 +++ b/src/time_integrator.f90 @@ -1,26 +1,26 @@ -module m_time_integrator - use m_allocator, only: allocator_t, field_t, flist_t - use m_base_backend, only: base_backend_t - use m_common, only: dp, DIR_X + module m_time_integrator + use m_allocator, only: allocator_t, field_t, flist_t + use m_base_backend, only: base_backend_t + use m_common, only: dp, DIR_X - implicit none + implicit none - type :: time_intg_t + type :: time_intg_t integer :: istep, nsteps, nsubsteps, order, nvars, nolds type(flist_t), allocatable :: olds(:,:) class(base_backend_t), pointer :: backend class(allocator_t), pointer :: allocator - contains + contains procedure :: step - end type time_intg_t + end type time_intg_t - interface time_intg_t + interface time_intg_t module procedure constructor - end interface time_intg_t + end interface time_intg_t -contains + contains - function constructor(backend, allocator, nvars) + function constructor(backend, allocator, nvars) implicit none type(time_intg_t) :: constructor @@ -34,9 +34,9 @@ function constructor(backend, allocator, nvars) constructor%allocator => allocator if (present(nvars)) then - constructor%nvars = nvars + constructor%nvars = nvars else - constructor%nvars = 3 + constructor%nvars = 3 end if constructor%nolds = 0 @@ -45,14 +45,14 @@ function constructor(backend, allocator, nvars) ! Request all the storage for old timesteps do i = 1, constructor%nvars - do j = 1, constructor%nolds - constructor%olds(i, j)%ptr => allocator%get_block(DIR_X) - end do + do j = 1, constructor%nolds + constructor%olds(i, j)%ptr => allocator%get_block(DIR_X) + end do end do - end function constructor + end function constructor - subroutine step(self, u, v, w, du, dv, dw, dt) + subroutine step(self, u, v, w, du, dv, dw, dt) implicit none class(time_intg_t), intent(in) :: self @@ -65,13 +65,13 @@ subroutine step(self, u, v, w, du, dv, dw, dt) call self%backend%vecadd(dt, dv, 1._dp, v) call self%backend%vecadd(dt, dw, 1._dp, w) - end subroutine step + end subroutine step - subroutine adams_bashford_1st(vels, olds, coeffs) + subroutine adams_bashford_1st(vels, olds, coeffs) type(flist_t) :: vels(:), olds(:) real :: coeffs(:) !call vec_add(vels, olds, coeffs) - end subroutine adams_bashford_1st + end subroutine adams_bashford_1st -end module m_time_integrator + end module m_time_integrator diff --git a/src/xcompact.f90 b/src/xcompact.f90 index 7a976a97..a84a904b 100644 --- a/src/xcompact.f90 +++ b/src/xcompact.f90 @@ -1,158 +1,158 @@ -program xcompact - use mpi - - use m_allocator - use m_base_backend - use m_common, only: pi, globs_t, set_pprev_pnext, & - POISSON_SOLVER_FFT, POISSON_SOLVER_CG, & - DIR_X, DIR_Y, DIR_Z - use m_solver, only: solver_t - use m_time_integrator, only: time_intg_t - use m_tdsops, only: tdsops_t + program xcompact + use mpi + + use m_allocator + use m_base_backend + use m_common, only: pi, globs_t, set_pprev_pnext, & + POISSON_SOLVER_FFT, POISSON_SOLVER_CG, & + DIR_X, DIR_Y, DIR_Z + use m_solver, only: solver_t + use m_time_integrator, only: time_intg_t + use m_tdsops, only: tdsops_t #ifdef CUDA - use m_cuda_allocator - use m_cuda_backend - use m_cuda_common, only: SZ - use m_cuda_tdsops, only: cuda_tdsops_t + use m_cuda_allocator + use m_cuda_backend + use m_cuda_common, only: SZ + use m_cuda_tdsops, only: cuda_tdsops_t #else - use m_omp_backend - use m_omp_common, only: SZ + use m_omp_backend + use m_omp_common, only: SZ #endif - implicit none + implicit none - type(globs_t) :: globs - class(base_backend_t), pointer :: backend - class(allocator_t), pointer :: allocator - type(solver_t) :: solver - type(time_intg_t) :: time_integrator - type(dirps_t) :: xdirps, ydirps, zdirps + type(globs_t) :: globs + class(base_backend_t), pointer :: backend + class(allocator_t), pointer :: allocator + type(solver_t) :: solver + type(time_intg_t) :: time_integrator + type(dirps_t) :: xdirps, ydirps, zdirps #ifdef CUDA - type(cuda_backend_t), target :: cuda_backend - type(cuda_allocator_t), target :: cuda_allocator - integer :: ndevs, devnum + type(cuda_backend_t), target :: cuda_backend + type(cuda_allocator_t), target :: cuda_allocator + integer :: ndevs, devnum #else - type(omp_backend_t), target :: omp_backend - type(allocator_t), target :: omp_allocator + type(omp_backend_t), target :: omp_backend + type(allocator_t), target :: omp_allocator #endif - real(dp), allocatable, dimension(:, :, :) :: u, v, w + real(dp), allocatable, dimension(:, :, :) :: u, v, w - real(dp) :: t_start, t_end - integer :: dims(3) - integer :: nrank, nproc, ierr + real(dp) :: t_start, t_end + integer :: dims(3) + integer :: nrank, nproc, ierr - call MPI_Init(ierr) - call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) - if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' + if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' #ifdef CUDA - ierr = cudaGetDeviceCount(ndevs) - ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin - ierr = cudaGetDevice(devnum) + ierr = cudaGetDeviceCount(ndevs) + ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin + ierr = cudaGetDevice(devnum) #endif - ! read L_x/y/z from the input file - globs%Lx = 2*pi; globs%Ly = 2*pi; globs%Lz = 2*pi - xdirps%L = globs%Lx; ydirps%L = globs%Ly; zdirps%L = globs%Lz + ! read L_x/y/z from the input file + globs%Lx = 2*pi; globs%Ly = 2*pi; globs%Lz = 2*pi + xdirps%L = globs%Lx; ydirps%L = globs%Ly; zdirps%L = globs%Lz - ! read ns from the input file - globs%nx = 256; globs%ny = 256; globs%nz = 256 + ! read ns from the input file + globs%nx = 256; globs%ny = 256; globs%nz = 256 - globs%dt = 0.001_dp - globs%nu = 1._dp/1600._dp - globs%n_iters = 20000 - globs%n_output = 100 + globs%dt = 0.001_dp + globs%nu = 1._dp/1600._dp + globs%n_iters = 20000 + globs%n_output = 100 - ! set nprocs based on run time arguments - globs%nproc_x = 1; globs%nproc_y = 1; globs%nproc_z = 1 + ! set nprocs based on run time arguments + globs%nproc_x = 1; globs%nproc_y = 1; globs%nproc_z = 1 - globs%poisson_solver_type = POISSON_SOLVER_FFT + globs%poisson_solver_type = POISSON_SOLVER_FFT - ! Lets allow a 1D decomposition for the moment - !globs%nproc_x = nproc + ! Lets allow a 1D decomposition for the moment + !globs%nproc_x = nproc - xdirps%nproc = globs%nproc_x - ydirps%nproc = globs%nproc_y - zdirps%nproc = globs%nproc_z + xdirps%nproc = globs%nproc_x + ydirps%nproc = globs%nproc_y + zdirps%nproc = globs%nproc_z - ! Better if we move this somewhere else - ! Set the pprev and pnext for each rank - call set_pprev_pnext( & + ! Better if we move this somewhere else + ! Set the pprev and pnext for each rank + call set_pprev_pnext( & xdirps%pprev, xdirps%pnext, & ydirps%pprev, ydirps%pnext, & zdirps%pprev, zdirps%pnext, & xdirps%nproc, ydirps%nproc, zdirps%nproc, nrank & - ) + ) - ! lets assume simple cases for now - globs%nx_loc = globs%nx/globs%nproc_x - globs%ny_loc = globs%ny/globs%nproc_y - globs%nz_loc = globs%nz/globs%nproc_z + ! lets assume simple cases for now + globs%nx_loc = globs%nx/globs%nproc_x + globs%ny_loc = globs%ny/globs%nproc_y + globs%nz_loc = globs%nz/globs%nproc_z - globs%n_groups_x = globs%ny_loc*globs%nz_loc/SZ - globs%n_groups_y = globs%nx_loc*globs%nz_loc/SZ - globs%n_groups_z = globs%nx_loc*globs%ny_loc/SZ + globs%n_groups_x = globs%ny_loc*globs%nz_loc/SZ + globs%n_groups_y = globs%nx_loc*globs%nz_loc/SZ + globs%n_groups_z = globs%nx_loc*globs%ny_loc/SZ - globs%dx = globs%Lx/globs%nx - globs%dy = globs%Ly/globs%ny - globs%dz = globs%Lz/globs%nz + globs%dx = globs%Lx/globs%nx + globs%dy = globs%Ly/globs%ny + globs%dz = globs%Lz/globs%nz - xdirps%d = globs%dx; ydirps%d = globs%dy; zdirps%d = globs%dz + xdirps%d = globs%dx; ydirps%d = globs%dy; zdirps%d = globs%dz - xdirps%n = globs%nx_loc - ydirps%n = globs%ny_loc - zdirps%n = globs%nz_loc + xdirps%n = globs%nx_loc + ydirps%n = globs%ny_loc + zdirps%n = globs%nz_loc - xdirps%n_blocks = globs%n_groups_x - ydirps%n_blocks = globs%n_groups_y - zdirps%n_blocks = globs%n_groups_z + xdirps%n_blocks = globs%n_groups_x + ydirps%n_blocks = globs%n_groups_y + zdirps%n_blocks = globs%n_groups_z - xdirps%dir = DIR_X; ydirps%dir = DIR_Y; zdirps%dir = DIR_Z + xdirps%dir = DIR_X; ydirps%dir = DIR_Y; zdirps%dir = DIR_Z #ifdef CUDA - cuda_allocator = cuda_allocator_t(globs%nx_loc, globs%ny_loc, & - globs%nz_loc, SZ) - allocator => cuda_allocator - print*, 'CUDA allocator instantiated' - - cuda_backend = cuda_backend_t(globs, allocator) - backend => cuda_backend - print*, 'CUDA backend instantiated' + cuda_allocator = cuda_allocator_t(globs%nx_loc, globs%ny_loc, & + globs%nz_loc, SZ) + allocator => cuda_allocator + print*, 'CUDA allocator instantiated' + + cuda_backend = cuda_backend_t(globs, allocator) + backend => cuda_backend + print*, 'CUDA backend instantiated' #else - omp_allocator = allocator_t(globs%nx_loc, globs%ny_loc, globs%nz_loc, SZ) - allocator => omp_allocator - print*, 'OpenMP allocator instantiated' + omp_allocator = allocator_t(globs%nx_loc, globs%ny_loc, globs%nz_loc, SZ) + allocator => omp_allocator + print*, 'OpenMP allocator instantiated' - omp_backend = omp_backend_t(globs, allocator) - backend => omp_backend - print*, 'OpenMP backend instantiated' + omp_backend = omp_backend_t(globs, allocator) + backend => omp_backend + print*, 'OpenMP backend instantiated' #endif - dims(:) = allocator%cdims_padded - allocate (u(dims(1), dims(2), dims(3))) - allocate (v(dims(1), dims(2), dims(3))) - allocate (w(dims(1), dims(2), dims(3))) + dims(:) = allocator%cdims_padded + allocate (u(dims(1), dims(2), dims(3))) + allocate (v(dims(1), dims(2), dims(3))) + allocate (w(dims(1), dims(2), dims(3))) - time_integrator = time_intg_t(allocator=allocator, backend=backend) - print*, 'time integrator instantiated' - solver = solver_t(backend, time_integrator, xdirps, ydirps, zdirps, globs) - print*, 'solver instantiated' + time_integrator = time_intg_t(allocator=allocator, backend=backend) + print*, 'time integrator instantiated' + solver = solver_t(backend, time_integrator, xdirps, ydirps, zdirps, globs) + print*, 'solver instantiated' - call cpu_time(t_start) + call cpu_time(t_start) - call solver%run(u, v, w) + call solver%run(u, v, w) - call cpu_time(t_end) + call cpu_time(t_end) - print*, 'Time: ', t_end - t_start + print*, 'Time: ', t_end - t_start - print*, 'norms', norm2(u), norm2(v), norm2(w) + print*, 'norms', norm2(u), norm2(v), norm2(w) - call MPI_Finalize(ierr) + call MPI_Finalize(ierr) -end program xcompact + end program xcompact diff --git a/tests/cuda/test_cuda_allocator.f90 b/tests/cuda/test_cuda_allocator.f90 index a0d95ea3..8ce6191a 100644 --- a/tests/cuda/test_cuda_allocator.f90 +++ b/tests/cuda/test_cuda_allocator.f90 @@ -1,72 +1,72 @@ -program test_allocator_cuda - use iso_fortran_env, only: stderr => error_unit + program test_allocator_cuda + use iso_fortran_env, only: stderr => error_unit - use m_allocator, only: allocator_t, field_t - use m_common, only: DIR_X - use m_cuda_allocator, only: cuda_allocator_t + use m_allocator, only: allocator_t, field_t + use m_common, only: DIR_X + use m_cuda_allocator, only: cuda_allocator_t - implicit none + implicit none - logical :: allpass - integer, parameter :: dims(3) = [8, 8, 8] - class(allocator_t), allocatable :: allocator - class(field_t), pointer :: ptr1, ptr2, ptr3 - integer, allocatable :: l(:) + logical :: allpass + integer, parameter :: dims(3) = [8, 8, 8] + class(allocator_t), allocatable :: allocator + class(field_t), pointer :: ptr1, ptr2, ptr3 + integer, allocatable :: l(:) - allocator = cuda_allocator_t(dims(1), dims(2), dims(3), 8) + allocator = cuda_allocator_t(dims(1), dims(2), dims(3), 8) - allpass = .true. + allpass = .true. - ! Get the list of ids for free blocks. By default there are none - ! and returned list is [0]. - l = allocator%get_block_ids() - if (size(l) /= 1 .or. l(1) /= 0) then - allpass = .false. - write(stderr, '(a)') 'Free list is initialised empty... failed' - else - write(stderr, '(a)') 'Free list is initialised empty... passed' - end if + ! Get the list of ids for free blocks. By default there are none + ! and returned list is [0]. + l = allocator%get_block_ids() + if (size(l) /= 1 .or. l(1) /= 0) then + allpass = .false. + write(stderr, '(a)') 'Free list is initialised empty... failed' + else + write(stderr, '(a)') 'Free list is initialised empty... passed' + end if - ! Request two blocks and release them in reverse order. List should - ! contain two free blocks. (1 -> 2) - ptr1 => allocator%get_block(DIR_X) - ptr2 => allocator%get_block(DIR_X) - call allocator%release_block(ptr2) - call allocator%release_block(ptr1) + ! Request two blocks and release them in reverse order. List should + ! contain two free blocks. (1 -> 2) + ptr1 => allocator%get_block(DIR_X) + ptr2 => allocator%get_block(DIR_X) + call allocator%release_block(ptr2) + call allocator%release_block(ptr1) - if (.not. all(allocator%get_block_ids() .eq. [1, 2])) then - allpass = .false. - write(stderr, '(a)') 'Blocks are released correctly... failed' - else - write(stderr, '(a)') 'Blocks are released correctly... passed' - end if + if (.not. all(allocator%get_block_ids() .eq. [1, 2])) then + allpass = .false. + write(stderr, '(a)') 'Blocks are released correctly... failed' + else + write(stderr, '(a)') 'Blocks are released correctly... passed' + end if - !! Destroy the free list and check that the list is empty again. - call allocator%destroy() - l = allocator%get_block_ids() - if (size(l) /= 1 .or. l(1) /= 0 .or. allocator%next_id /=0) then - allpass = .false. - write(stderr, '(a)') 'Free list is correctly destroyed... failed' - else - write(stderr, '(a)') 'Free list is correctly destroyed... passed' - end if + !! Destroy the free list and check that the list is empty again. + call allocator%destroy() + l = allocator%get_block_ids() + if (size(l) /= 1 .or. l(1) /= 0 .or. allocator%next_id /=0) then + allpass = .false. + write(stderr, '(a)') 'Free list is correctly destroyed... failed' + else + write(stderr, '(a)') 'Free list is correctly destroyed... passed' + end if - ! Request a block from a list of three. This should grab the first - ! block on top of the pile and reduce the free list to two blocks. - ptr1 => allocator%get_block(DIR_X) - ptr2 => allocator%get_block(DIR_X) - ptr3 => allocator%get_block(DIR_X) - call allocator%release_block(ptr3) - call allocator%release_block(ptr2) - call allocator%release_block(ptr1) - ptr1 => allocator%get_block(DIR_X) + ! Request a block from a list of three. This should grab the first + ! block on top of the pile and reduce the free list to two blocks. + ptr1 => allocator%get_block(DIR_X) + ptr2 => allocator%get_block(DIR_X) + ptr3 => allocator%get_block(DIR_X) + call allocator%release_block(ptr3) + call allocator%release_block(ptr2) + call allocator%release_block(ptr1) + ptr1 => allocator%get_block(DIR_X) - if (.not. all(allocator%get_block_ids() .eq. [2, 3])) then - allpass = .false. - write(stderr, '(a)') 'Block is correctly allocated... failed' - else - write(stderr, '(a)') 'Block is correctly allocated... passed' - end if + if (.not. all(allocator%get_block_ids() .eq. [2, 3])) then + allpass = .false. + write(stderr, '(a)') 'Block is correctly allocated... failed' + else + write(stderr, '(a)') 'Block is correctly allocated... passed' + end if - call allocator%destroy() -end program test_allocator_cuda + call allocator%destroy() + end program test_allocator_cuda diff --git a/tests/cuda/test_cuda_reorder.f90 b/tests/cuda/test_cuda_reorder.f90 index d3bbf97c..b60e78ab 100644 --- a/tests/cuda/test_cuda_reorder.f90 +++ b/tests/cuda/test_cuda_reorder.f90 @@ -1,235 +1,235 @@ -program test_cuda_reorder - use iso_fortran_env, only: stderr => error_unit - use cudafor + program test_cuda_reorder + use iso_fortran_env, only: stderr => error_unit + use cudafor - use m_common, only: dp - use m_cuda_common, only: SZ - use m_cuda_kernels_reorder, only: reorder_x2y, reorder_x2z, reorder_y2x, & - reorder_y2z, reorder_z2x, reorder_z2y, & - reorder_c2x, reorder_x2c + use m_common, only: dp + use m_cuda_common, only: SZ + use m_cuda_kernels_reorder, only: reorder_x2y, reorder_x2z, reorder_y2x, & + reorder_y2z, reorder_z2x, reorder_z2y, & + reorder_c2x, reorder_x2c - implicit none + implicit none - logical :: allpass = .true. - real(dp), allocatable, dimension(:, :, :) :: u_i, u_o, u_temp, u_c - real(dp), device, allocatable, dimension(:, :, :) :: u_i_d, u_o_d, & - u_temp_d, u_c_d + logical :: allpass = .true. + real(dp), allocatable, dimension(:, :, :) :: u_i, u_o, u_temp, u_c + real(dp), device, allocatable, dimension(:, :, :) :: u_i_d, u_o_d, & + u_temp_d, u_c_d - integer :: n_block, i, n_iters - integer :: nx, ny, nz, ndof + integer :: n_block, i, n_iters + integer :: nx, ny, nz, ndof - type(dim3) :: blocks, threads - real(dp) :: norm_u, tol = 1d-8, tstart, tend + type(dim3) :: blocks, threads + real(dp) :: norm_u, tol = 1d-8, tstart, tend - nx = 512; ny = 512; nz = 512 - n_block = ny*nz/SZ - ndof = nx*ny*nz - n_iters = 100 + nx = 512; ny = 512; nz = 512 + n_block = ny*nz/SZ + ndof = nx*ny*nz + n_iters = 100 - allocate (u_i(SZ, nx, n_block), u_o(SZ, nx, n_block)) - allocate (u_temp(SZ, nx, n_block)) - allocate (u_i_d(SZ, nx, n_block), u_o_d(SZ, nx, n_block)) - allocate (u_temp_d(SZ, nx, n_block)) + allocate (u_i(SZ, nx, n_block), u_o(SZ, nx, n_block)) + allocate (u_temp(SZ, nx, n_block)) + allocate (u_i_d(SZ, nx, n_block), u_o_d(SZ, nx, n_block)) + allocate (u_temp_d(SZ, nx, n_block)) - ! Cartesian order storage - allocate (u_c_d(nx, ny, nz)) - allocate (u_c(nx, ny, nz)) + ! Cartesian order storage + allocate (u_c_d(nx, ny, nz)) + allocate (u_c(nx, ny, nz)) - ! set a random field - call random_number(u_i) + ! set a random field + call random_number(u_i) - ! move data to device - u_i_d = u_i + ! move data to device + u_i_d = u_i - ! do a x to y reordering first and then a y to x - blocks = dim3(nx/SZ, nz, ny/SZ) - threads = dim3(SZ, SZ, 1) - call reorder_x2y<<>>(u_temp_d, u_i_d, nz) + ! do a x to y reordering first and then a y to x + blocks = dim3(nx/SZ, nz, ny/SZ) + threads = dim3(SZ, SZ, 1) + call reorder_x2y<<>>(u_temp_d, u_i_d, nz) - blocks = dim3(nx/SZ, ny/SZ, nz) - threads = dim3(SZ, SZ, 1) - call reorder_y2x<<>>(u_o_d, u_temp_d, nz) + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call reorder_y2x<<>>(u_o_d, u_temp_d, nz) - ! move the result back to host - u_o = u_o_d + ! move the result back to host + u_o = u_o_d - ! and check whether it matches the initial random field - norm_u = norm2(u_o - u_i) - if ( norm_u > tol ) then + ! and check whether it matches the initial random field + norm_u = norm2(u_o - u_i) + if ( norm_u > tol ) then allpass = .false. write(stderr, '(a)') 'Check reorder x2y and y2x... failed' - else + else write(stderr, '(a)') 'Check reorder x2y and y2x... passed' - end if + end if - ! we reuse u_o_d so zeroize in any case - u_o_d = 0 + ! we reuse u_o_d so zeroize in any case + u_o_d = 0 - ! u_temp_d is in y orientation, use y2z to reorder it into z direction - blocks = dim3(nx/SZ, ny/SZ, nz) - threads = dim3(SZ, SZ, 1) - call reorder_y2z<<>>(u_o_d, u_temp_d, nx, nz) + ! u_temp_d is in y orientation, use y2z to reorder it into z direction + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call reorder_y2z<<>>(u_o_d, u_temp_d, nx, nz) - ! store initial z oriented field - u_temp = u_temp_d + ! store initial z oriented field + u_temp = u_temp_d - ! z oriented field into y - blocks = dim3(nx/SZ, ny/SZ, nz) - threads = dim3(SZ, SZ, 1) - call reorder_z2y<<>>(u_temp_d, u_o_d, nx, nz) + ! z oriented field into y + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call reorder_z2y<<>>(u_temp_d, u_o_d, nx, nz) - u_o = u_temp_d + u_o = u_temp_d - ! compare two y oriented fields - norm_u = norm2(u_o - u_temp) - if ( norm_u > tol ) then + ! compare two y oriented fields + norm_u = norm2(u_o - u_temp) + if ( norm_u > tol ) then allpass = .false. write(stderr, '(a)') 'Check reorder y2z and y2z... failed' - else + else write(stderr, '(a)') 'Check reorder y2z and y2z... passed' - end if - - ! reorder initial random field into z orientation - blocks = dim3(nx, ny/SZ, 1) - threads = dim3(SZ, 1, 1) - call reorder_x2z<<>>(u_o_d, u_i_d, nz) - - ! z oriented field into x - blocks = dim3(nx, ny/SZ, 1) - threads = dim3(SZ, 1, 1) - call reorder_z2x<<>>(u_temp_d, u_o_d, nz) - u_o = u_temp_d - - ! compare two z oriented fields - norm_u = norm2(u_o - u_i) - if ( norm_u > tol ) then + end if + + ! reorder initial random field into z orientation + blocks = dim3(nx, ny/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_x2z<<>>(u_o_d, u_i_d, nz) + + ! z oriented field into x + blocks = dim3(nx, ny/SZ, 1) + threads = dim3(SZ, 1, 1) + call reorder_z2x<<>>(u_temp_d, u_o_d, nz) + u_o = u_temp_d + + ! compare two z oriented fields + norm_u = norm2(u_o - u_i) + if ( norm_u > tol ) then allpass = .false. write(stderr, '(a)') 'Check reorder x2z and z2x... failed' - else + else write(stderr, '(a)') 'Check reorder x2z and z2x... passed' - end if + end if - ! x ordering into Cartesian ordering - blocks = dim3(nx/SZ, ny/SZ, nz) - threads = dim3(SZ, SZ, 1) - call reorder_x2c<<>>(u_c_d, u_i_d, nz) + ! x ordering into Cartesian ordering + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call reorder_x2c<<>>(u_c_d, u_i_d, nz) - ! sanitise u_o_d - u_o_d = 0 + ! sanitise u_o_d + u_o_d = 0 - ! Cartesian ordering back to x ordering - call reorder_c2x<<>>(u_o_d, u_c_d, nz) - u_o = u_o_d + ! Cartesian ordering back to x ordering + call reorder_c2x<<>>(u_o_d, u_c_d, nz) + u_o = u_o_d - ! now both u_o and u_i in x ordering, compare them - norm_u = norm2(u_o - u_i) - if ( norm_u > tol ) then + ! now both u_o and u_i in x ordering, compare them + norm_u = norm2(u_o - u_i) + if ( norm_u > tol ) then allpass = .false. write(stderr, '(a)') 'Check reorder x2c and c2x... failed' - else + else write(stderr, '(a)') 'Check reorder x2c and c2x... passed' - end if + end if - if (allpass) then + if (allpass) then write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' - else + else error stop 'SOME TESTS FAILED.' - end if + end if - ! Now the performance checks + ! Now the performance checks - print*, 'Performance test: reorder_x2y' - blocks = dim3(nx/SZ, nz, ny/SZ) - threads = dim3(SZ, SZ, 1) - call cpu_time(tstart) - do i = 1, n_iters + print*, 'Performance test: reorder_x2y' + blocks = dim3(nx/SZ, nz, ny/SZ) + threads = dim3(SZ, SZ, 1) + call cpu_time(tstart) + do i = 1, n_iters call reorder_x2y<<>>(u_o_d, u_i_d, nz) - end do - call cpu_time(tend) + end do + call cpu_time(tend) - call checkperf(tend - tstart, n_iters, ndof, 2._dp) + call checkperf(tend - tstart, n_iters, ndof, 2._dp) - print*, 'Performance test: reorder_x2z' - blocks = dim3(nx, ny/SZ, 1) - threads = dim3(SZ, 1, 1) - call cpu_time(tstart) - do i = 1, n_iters + print*, 'Performance test: reorder_x2z' + blocks = dim3(nx, ny/SZ, 1) + threads = dim3(SZ, 1, 1) + call cpu_time(tstart) + do i = 1, n_iters call reorder_x2z<<>>(u_o_d, u_i_d, nz) - end do - call cpu_time(tend) + end do + call cpu_time(tend) - call checkperf(tend - tstart, n_iters, ndof, 2._dp) + call checkperf(tend - tstart, n_iters, ndof, 2._dp) - print*, 'Performance test: reorder_y2x' - blocks = dim3(nx/SZ, ny/SZ, nz) - threads = dim3(SZ, SZ, 1) - call cpu_time(tstart) - do i = 1, n_iters + print*, 'Performance test: reorder_y2x' + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call cpu_time(tstart) + do i = 1, n_iters call reorder_y2x<<>>(u_o_d, u_i_d, nz) - end do - call cpu_time(tend) + end do + call cpu_time(tend) - call checkperf(tend - tstart, n_iters, ndof, 2._dp) + call checkperf(tend - tstart, n_iters, ndof, 2._dp) - print*, 'Performance test: reorder_y2z' - blocks = dim3(nx/SZ, ny/SZ, nz) - threads = dim3(SZ, SZ, 1) - call cpu_time(tstart) - do i = 1, n_iters + print*, 'Performance test: reorder_y2z' + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call cpu_time(tstart) + do i = 1, n_iters call reorder_y2z<<>>(u_o_d, u_i_d, nx, nz) - end do - call cpu_time(tend) + end do + call cpu_time(tend) - call checkperf(tend - tstart, n_iters, ndof, 2._dp) + call checkperf(tend - tstart, n_iters, ndof, 2._dp) - print*, 'Performance test: reorder_z2x' - blocks = dim3(nx, ny/SZ, 1) - threads = dim3(SZ, 1, 1) - call cpu_time(tstart) - do i = 1, n_iters + print*, 'Performance test: reorder_z2x' + blocks = dim3(nx, ny/SZ, 1) + threads = dim3(SZ, 1, 1) + call cpu_time(tstart) + do i = 1, n_iters call reorder_z2x<<>>(u_o_d, u_i_d, nz) - end do - call cpu_time(tend) + end do + call cpu_time(tend) - call checkperf(tend - tstart, n_iters, ndof, 2._dp) + call checkperf(tend - tstart, n_iters, ndof, 2._dp) - print*, 'Performance test: reorder_z2y' - blocks = dim3(nx/SZ, ny/SZ, nz) - threads = dim3(SZ, SZ, 1) - call cpu_time(tstart) - do i = 1, n_iters + print*, 'Performance test: reorder_z2y' + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call cpu_time(tstart) + do i = 1, n_iters call reorder_z2y<<>>(u_o_d, u_i_d, nx, nz) - end do - call cpu_time(tend) + end do + call cpu_time(tend) - call checkperf(tend - tstart, n_iters, ndof, 2._dp) + call checkperf(tend - tstart, n_iters, ndof, 2._dp) - print*, 'Performance test: reorder_x2c' - blocks = dim3(nx/SZ, ny/SZ, nz) - threads = dim3(SZ, SZ, 1) - call cpu_time(tstart) - do i = 1, n_iters + print*, 'Performance test: reorder_x2c' + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call cpu_time(tstart) + do i = 1, n_iters call reorder_x2c<<>>(u_c_d, u_i_d, nz) - end do - call cpu_time(tend) + end do + call cpu_time(tend) - call checkperf(tend - tstart, n_iters, ndof, 2._dp) + call checkperf(tend - tstart, n_iters, ndof, 2._dp) - print*, 'Performance test: reorder_c2x' - blocks = dim3(nx/SZ, ny/SZ, nz) - threads = dim3(SZ, SZ, 1) - call cpu_time(tstart) - do i = 1, n_iters + print*, 'Performance test: reorder_c2x' + blocks = dim3(nx/SZ, ny/SZ, nz) + threads = dim3(SZ, SZ, 1) + call cpu_time(tstart) + do i = 1, n_iters call reorder_c2x<<>>(u_o_d, u_c_d, nz) - end do - call cpu_time(tend) + end do + call cpu_time(tend) - call checkperf(tend - tstart, n_iters, ndof, 2._dp) + call checkperf(tend - tstart, n_iters, ndof, 2._dp) -contains + contains - subroutine checkperf(t_tot, n_iters, ndof, consumed_bw) + subroutine checkperf(t_tot, n_iters, ndof, consumed_bw) implicit none real(dp), intent(in) :: t_tot, consumed_bw @@ -245,13 +245,13 @@ subroutine checkperf(t_tot, n_iters, ndof, consumed_bw) ierr = cudaDeviceGetAttribute(memClockRt, cudaDevAttrMemoryClockRate, 0) ierr = cudaDeviceGetAttribute(memBusWidth, & - cudaDevAttrGlobalMemoryBusWidth, 0) + cudaDevAttrGlobalMemoryBusWidth, 0) devBW = 2*memBusWidth/8._dp*memClockRt*1000 print'(a, f8.3, a)', 'Device BW: ', devBW/2**30, ' GiB/s' print'(a, f5.2)', 'Effective BW util: %', achievedBW/devBW*100 - end subroutine checkperf + end subroutine checkperf -end program test_cuda_reorder + end program test_cuda_reorder diff --git a/tests/cuda/test_cuda_transeq.f90 b/tests/cuda/test_cuda_transeq.f90 index b1f46ddf..cbd9b1c7 100644 --- a/tests/cuda/test_cuda_transeq.f90 +++ b/tests/cuda/test_cuda_transeq.f90 @@ -1,120 +1,120 @@ -program test_cuda_tridiag - use iso_fortran_env, only: stderr => error_unit - use cudafor - use mpi - - use m_common, only: dp, pi - use m_cuda_common, only: SZ - use m_cuda_exec_dist, only: exec_dist_transeq_3fused - use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields - use m_cuda_tdsops, only: cuda_tdsops_t - - implicit none - - logical :: allpass = .true. - real(dp), allocatable, dimension(:, :, :) :: u, v, r_u - real(dp), device, allocatable, dimension(:, :, :) :: & + program test_cuda_tridiag + use iso_fortran_env, only: stderr => error_unit + use cudafor + use mpi + + use m_common, only: dp, pi + use m_cuda_common, only: SZ + use m_cuda_exec_dist, only: exec_dist_transeq_3fused + use m_cuda_sendrecv, only: sendrecv_fields, sendrecv_3fields + use m_cuda_tdsops, only: cuda_tdsops_t + + implicit none + + logical :: allpass = .true. + real(dp), allocatable, dimension(:, :, :) :: u, v, r_u + real(dp), device, allocatable, dimension(:, :, :) :: & u_dev, v_dev, r_u_dev, & ! main fields u, v and result r_u du_dev, dud_dev, d2u_dev ! intermediate solution arrays - real(dp), device, allocatable, dimension(:, :, :) :: & + real(dp), device, allocatable, dimension(:, :, :) :: & du_recv_s_dev, du_recv_e_dev, du_send_s_dev, du_send_e_dev, & dud_recv_s_dev, dud_recv_e_dev, dud_send_s_dev, dud_send_e_dev, & d2u_recv_s_dev, d2u_recv_e_dev, d2u_send_s_dev, d2u_send_e_dev - real(dp), device, allocatable, dimension(:, :, :) :: & + real(dp), device, allocatable, dimension(:, :, :) :: & u_send_s_dev, u_send_e_dev, u_recv_s_dev, u_recv_e_dev, & v_send_s_dev, v_send_e_dev, v_recv_s_dev, v_recv_e_dev - type(cuda_tdsops_t) :: der1st, der2nd + type(cuda_tdsops_t) :: der1st, der2nd - integer :: n, n_block, i, j, k, n_halo, n_iters - integer :: n_glob - integer :: nrank, nproc, pprev, pnext, tag1=1234, tag2=1234 - integer :: srerr(12), mpireq(12) - integer :: ierr, ndevs, devnum, memClockRt, memBusWidth + integer :: n, n_block, i, j, k, n_halo, n_iters + integer :: n_glob + integer :: nrank, nproc, pprev, pnext, tag1=1234, tag2=1234 + integer :: srerr(12), mpireq(12) + integer :: ierr, ndevs, devnum, memClockRt, memBusWidth - type(dim3) :: blocks, threads - real(dp) :: dx, dx_per, nu, norm_du, tol = 1d-8, tstart, tend - real(dp) :: achievedBW, deviceBW, achievedBWmax, achievedBWmin + type(dim3) :: blocks, threads + real(dp) :: dx, dx_per, nu, norm_du, tol = 1d-8, tstart, tend + real(dp) :: achievedBW, deviceBW, achievedBWmax, achievedBWmin - call MPI_Init(ierr) - call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) - if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' + if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' - ierr = cudaGetDeviceCount(ndevs) - ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin - ierr = cudaGetDevice(devnum) + ierr = cudaGetDeviceCount(ndevs) + ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin + ierr = cudaGetDevice(devnum) - !print*, 'I am rank', nrank, 'I am running on device', devnum - pnext = modulo(nrank - nproc + 1, nproc) - pprev = modulo(nrank - 1, nproc) + !print*, 'I am rank', nrank, 'I am running on device', devnum + pnext = modulo(nrank - nproc + 1, nproc) + pprev = modulo(nrank - 1, nproc) - n_glob = 512 - n = n_glob/nproc - n_block = 512*512/SZ - n_iters = 100 + n_glob = 512 + n = n_glob/nproc + n_block = 512*512/SZ + n_iters = 100 - nu = 1._dp + nu = 1._dp - allocate(u(SZ, n, n_block), v(SZ, n, n_block), r_u(SZ, n, n_block)) + allocate(u(SZ, n, n_block), v(SZ, n, n_block), r_u(SZ, n, n_block)) - ! main input fields - allocate(u_dev(SZ, n, n_block), v_dev(SZ, n, n_block)) - ! field for storing the result - allocate(r_u_dev(SZ, n, n_block)) - ! intermediate solution fields - allocate(du_dev(SZ, n, n_block)) - allocate(dud_dev(SZ, n, n_block)) - allocate(d2u_dev(SZ, n, n_block)) + ! main input fields + allocate(u_dev(SZ, n, n_block), v_dev(SZ, n, n_block)) + ! field for storing the result + allocate(r_u_dev(SZ, n, n_block)) + ! intermediate solution fields + allocate(du_dev(SZ, n, n_block)) + allocate(dud_dev(SZ, n, n_block)) + allocate(d2u_dev(SZ, n, n_block)) - dx_per = 2*pi/n_glob - dx = 2*pi/(n_glob - 1) + dx_per = 2*pi/n_glob + dx = 2*pi/(n_glob - 1) - do k = 1, n_block + do k = 1, n_block do j = 1, n - do i = 1, SZ - u(i, j, k) = sin((j - 1 + nrank*n)*dx_per) - v(i, j, k) = cos((j - 1 + nrank*n)*dx_per) - end do + do i = 1, SZ + u(i, j, k) = sin((j - 1 + nrank*n)*dx_per) + v(i, j, k) = cos((j - 1 + nrank*n)*dx_per) + end do end do - end do - - ! move data to device - u_dev = u - v_dev = v - - n_halo = 4 - - ! arrays for exchanging data between ranks - allocate(u_send_s_dev(SZ, n_halo, n_block)) - allocate(u_send_e_dev(SZ, n_halo, n_block)) - allocate(u_recv_s_dev(SZ, n_halo, n_block)) - allocate(u_recv_e_dev(SZ, n_halo, n_block)) - allocate(v_send_s_dev(SZ, n_halo, n_block)) - allocate(v_send_e_dev(SZ, n_halo, n_block)) - allocate(v_recv_s_dev(SZ, n_halo, n_block)) - allocate(v_recv_e_dev(SZ, n_halo, n_block)) - - allocate(du_send_s_dev(SZ, 1, n_block), du_send_e_dev(SZ, 1, n_block)) - allocate(du_recv_s_dev(SZ, 1, n_block), du_recv_e_dev(SZ, 1, n_block)) - allocate(dud_send_s_dev(SZ, 1, n_block), dud_send_e_dev(SZ, 1, n_block)) - allocate(dud_recv_s_dev(SZ, 1, n_block), dud_recv_e_dev(SZ, 1, n_block)) - allocate(d2u_send_s_dev(SZ, 1, n_block), d2u_send_e_dev(SZ, 1, n_block)) - allocate(d2u_recv_s_dev(SZ, 1, n_block), d2u_recv_e_dev(SZ, 1, n_block)) - - ! preprocess the operator and coefficient arrays - der1st = cuda_tdsops_t(n, dx_per, operation='first-deriv', & - scheme='compact6') - der2nd = cuda_tdsops_t(n, dx_per, operation='second-deriv', & - scheme='compact6') - - blocks = dim3(n_block, 1, 1) - threads = dim3(SZ, 1, 1) - - call cpu_time(tstart) - do i = 1, n_iters + end do + + ! move data to device + u_dev = u + v_dev = v + + n_halo = 4 + + ! arrays for exchanging data between ranks + allocate(u_send_s_dev(SZ, n_halo, n_block)) + allocate(u_send_e_dev(SZ, n_halo, n_block)) + allocate(u_recv_s_dev(SZ, n_halo, n_block)) + allocate(u_recv_e_dev(SZ, n_halo, n_block)) + allocate(v_send_s_dev(SZ, n_halo, n_block)) + allocate(v_send_e_dev(SZ, n_halo, n_block)) + allocate(v_recv_s_dev(SZ, n_halo, n_block)) + allocate(v_recv_e_dev(SZ, n_halo, n_block)) + + allocate(du_send_s_dev(SZ, 1, n_block), du_send_e_dev(SZ, 1, n_block)) + allocate(du_recv_s_dev(SZ, 1, n_block), du_recv_e_dev(SZ, 1, n_block)) + allocate(dud_send_s_dev(SZ, 1, n_block), dud_send_e_dev(SZ, 1, n_block)) + allocate(dud_recv_s_dev(SZ, 1, n_block), dud_recv_e_dev(SZ, 1, n_block)) + allocate(d2u_send_s_dev(SZ, 1, n_block), d2u_send_e_dev(SZ, 1, n_block)) + allocate(d2u_recv_s_dev(SZ, 1, n_block), d2u_recv_e_dev(SZ, 1, n_block)) + + ! preprocess the operator and coefficient arrays + der1st = cuda_tdsops_t(n, dx_per, operation='first-deriv', & + scheme='compact6') + der2nd = cuda_tdsops_t(n, dx_per, operation='second-deriv', & + scheme='compact6') + + blocks = dim3(n_block, 1, 1) + threads = dim3(SZ, 1, 1) + + call cpu_time(tstart) + do i = 1, n_iters u_send_s_dev(:, :, :) = u_dev(:, 1:4, :) u_send_e_dev(:, :, :) = u_dev(:, n - n_halo + 1:n, :) v_send_s_dev(:, :, :) = v_dev(:, 1:4, :) @@ -123,79 +123,79 @@ program test_cuda_tridiag ! halo exchange call sendrecv_fields(u_recv_s_dev, u_recv_e_dev, & - u_send_s_dev, u_send_e_dev, & - SZ*4*n_block, nproc, pprev, pnext) + u_send_s_dev, u_send_e_dev, & + SZ*4*n_block, nproc, pprev, pnext) call sendrecv_fields(v_recv_s_dev, v_recv_e_dev, & - v_send_s_dev, v_send_e_dev, & - SZ*4*n_block, nproc, pprev, pnext) + v_send_s_dev, v_send_e_dev, & + SZ*4*n_block, nproc, pprev, pnext) call exec_dist_transeq_3fused( & - r_u_dev, & - u_dev, u_recv_s_dev, u_recv_e_dev, & - v_dev, v_recv_s_dev, v_recv_e_dev, & - du_dev, dud_dev, d2u_dev, & - du_send_s_dev, du_send_e_dev, du_recv_s_dev, du_recv_e_dev, & - dud_send_s_dev, dud_send_e_dev, dud_recv_s_dev, dud_recv_e_dev, & - d2u_send_s_dev, d2u_send_e_dev, d2u_recv_s_dev, d2u_recv_e_dev, & - der1st, der2nd, nu, nproc, pprev, pnext, blocks, threads & - ) - end do - - call cpu_time(tend) - if (nrank == 0) print*, 'Total time', tend - tstart - - ! BW utilisation and performance checks - ! 11 in the first phase, 5 in the second phase, 16 in total - achievedBW = 16._dp*n_iters*n*n_block*SZ*dp/(tend - tstart) - call MPI_Allreduce(achievedBW, achievedBWmax, 1, MPI_DOUBLE_PRECISION, & - MPI_MAX, MPI_COMM_WORLD, ierr) - call MPI_Allreduce(achievedBW, achievedBWmin, 1, MPI_DOUBLE_PRECISION, & - MPI_MIN, MPI_COMM_WORLD, ierr) - - if (nrank == 0) then + r_u_dev, & + u_dev, u_recv_s_dev, u_recv_e_dev, & + v_dev, v_recv_s_dev, v_recv_e_dev, & + du_dev, dud_dev, d2u_dev, & + du_send_s_dev, du_send_e_dev, du_recv_s_dev, du_recv_e_dev, & + dud_send_s_dev, dud_send_e_dev, dud_recv_s_dev, dud_recv_e_dev, & + d2u_send_s_dev, d2u_send_e_dev, d2u_recv_s_dev, d2u_recv_e_dev, & + der1st, der2nd, nu, nproc, pprev, pnext, blocks, threads & + ) + end do + + call cpu_time(tend) + if (nrank == 0) print*, 'Total time', tend - tstart + + ! BW utilisation and performance checks + ! 11 in the first phase, 5 in the second phase, 16 in total + achievedBW = 16._dp*n_iters*n*n_block*SZ*dp/(tend - tstart) + call MPI_Allreduce(achievedBW, achievedBWmax, 1, MPI_DOUBLE_PRECISION, & + MPI_MAX, MPI_COMM_WORLD, ierr) + call MPI_Allreduce(achievedBW, achievedBWmin, 1, MPI_DOUBLE_PRECISION, & + MPI_MIN, MPI_COMM_WORLD, ierr) + + if (nrank == 0) then print'(a, f8.3, a)', 'Achieved BW min: ', achievedBWmin/2**30, ' GiB/s' print'(a, f8.3, a)', 'Achieved BW max: ', achievedBWmax/2**30, ' GiB/s' - end if + end if - ierr = cudaDeviceGetAttribute(memClockRt, cudaDevAttrMemoryClockRate, 0) - ierr = cudaDeviceGetAttribute(memBusWidth, & - cudaDevAttrGlobalMemoryBusWidth, 0) - deviceBW = 2*memBusWidth/8._dp*memClockRt*1000 + ierr = cudaDeviceGetAttribute(memClockRt, cudaDevAttrMemoryClockRate, 0) + ierr = cudaDeviceGetAttribute(memBusWidth, & + cudaDevAttrGlobalMemoryBusWidth, 0) + deviceBW = 2*memBusWidth/8._dp*memClockRt*1000 - if (nrank == 0) then + if (nrank == 0) then print'(a, f8.3, a)', 'Device BW: ', deviceBW/2**30, ' GiB/s' print'(a, f5.2)', 'Effective BW util min: %', achievedBWmin/deviceBW*100 print'(a, f5.2)', 'Effective BW util max: %', achievedBWmax/deviceBW*100 - end if + end if - ! check error - r_u = r_u_dev - r_u = r_u - (-v*v + 0.5_dp*u*u - nu*u) - norm_du = norm2(r_u) - norm_du = norm_du*norm_du/n_glob/n_block/SZ - call MPI_Allreduce(MPI_IN_PLACE, norm_du, 1, MPI_DOUBLE_PRECISION, & - MPI_SUM, MPI_COMM_WORLD, ierr) - norm_du = sqrt(norm_du) + ! check error + r_u = r_u_dev + r_u = r_u - (-v*v + 0.5_dp*u*u - nu*u) + norm_du = norm2(r_u) + norm_du = norm_du*norm_du/n_glob/n_block/SZ + call MPI_Allreduce(MPI_IN_PLACE, norm_du, 1, MPI_DOUBLE_PRECISION, & + MPI_SUM, MPI_COMM_WORLD, ierr) + norm_du = sqrt(norm_du) - if (nrank == 0) print*, 'error norm', norm_du + if (nrank == 0) print*, 'error norm', norm_du - if (nrank == 0) then + if (nrank == 0) then if ( norm_du > tol ) then - allpass = .false. - write(stderr, '(a)') 'Check second derivatives... failed' + allpass = .false. + write(stderr, '(a)') 'Check second derivatives... failed' else - write(stderr, '(a)') 'Check second derivatives... passed' + write(stderr, '(a)') 'Check second derivatives... passed' end if - end if + end if - if (allpass) then + if (allpass) then if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' - else + else error stop 'SOME TESTS FAILED.' - end if + end if - call MPI_Finalize(ierr) + call MPI_Finalize(ierr) -end program test_cuda_tridiag + end program test_cuda_tridiag diff --git a/tests/cuda/test_cuda_tridiag.f90 b/tests/cuda/test_cuda_tridiag.f90 index e8cb7157..97c3607a 100644 --- a/tests/cuda/test_cuda_tridiag.f90 +++ b/tests/cuda/test_cuda_tridiag.f90 @@ -1,161 +1,161 @@ -program test_cuda_tridiag - use iso_fortran_env, only: stderr => error_unit - use cudafor - use mpi - - use m_common, only: dp, pi - use m_cuda_common, only: SZ - use m_cuda_exec_dist, only: exec_dist_tds_compact - use m_cuda_sendrecv, only: sendrecv_fields - use m_cuda_tdsops, only: cuda_tdsops_t, cuda_tdsops_init - - implicit none - - logical :: allpass = .true. - real(dp), allocatable, dimension(:, :, :) :: u, du - real(dp), device, allocatable, dimension(:, :, :) :: u_dev, du_dev - real(dp), device, allocatable, dimension(:, :, :) :: & + program test_cuda_tridiag + use iso_fortran_env, only: stderr => error_unit + use cudafor + use mpi + + use m_common, only: dp, pi + use m_cuda_common, only: SZ + use m_cuda_exec_dist, only: exec_dist_tds_compact + use m_cuda_sendrecv, only: sendrecv_fields + use m_cuda_tdsops, only: cuda_tdsops_t, cuda_tdsops_init + + implicit none + + logical :: allpass = .true. + real(dp), allocatable, dimension(:, :, :) :: u, du + real(dp), device, allocatable, dimension(:, :, :) :: u_dev, du_dev + real(dp), device, allocatable, dimension(:, :, :) :: & u_recv_s_dev, u_recv_e_dev, u_send_s_dev, u_send_e_dev - real(dp), device, allocatable, dimension(:, :, :) :: & + real(dp), device, allocatable, dimension(:, :, :) :: & du_send_s_dev, du_send_e_dev, du_recv_s_dev, du_recv_e_dev - type(cuda_tdsops_t) :: tdsops + type(cuda_tdsops_t) :: tdsops - integer :: n, n_block, i, j, k, n_halo, n_iters - integer :: n_glob - integer :: nrank, nproc, pprev, pnext, tag1=1234, tag2=1234 - integer :: srerr(4), mpireq(4) - integer :: ierr, ndevs, devnum, memClockRt, memBusWidth + integer :: n, n_block, i, j, k, n_halo, n_iters + integer :: n_glob + integer :: nrank, nproc, pprev, pnext, tag1=1234, tag2=1234 + integer :: srerr(4), mpireq(4) + integer :: ierr, ndevs, devnum, memClockRt, memBusWidth - type(dim3) :: blocks, threads - real(dp) :: dx, dx_per, norm_du, tol = 1d-8, tstart, tend - real(dp) :: achievedBW, deviceBW, achievedBWmax, achievedBWmin + type(dim3) :: blocks, threads + real(dp) :: dx, dx_per, norm_du, tol = 1d-8, tstart, tend + real(dp) :: achievedBW, deviceBW, achievedBWmax, achievedBWmin - call MPI_Init(ierr) - call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) - if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' + if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' - ierr = cudaGetDeviceCount(ndevs) - ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin - ierr = cudaGetDevice(devnum) + ierr = cudaGetDeviceCount(ndevs) + ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin + ierr = cudaGetDevice(devnum) - !print*, 'I am rank', nrank, 'I am running on device', devnum - pnext = modulo(nrank - nproc + 1, nproc) - pprev = modulo(nrank - 1, nproc) + !print*, 'I am rank', nrank, 'I am running on device', devnum + pnext = modulo(nrank - nproc + 1, nproc) + pprev = modulo(nrank - 1, nproc) - n_glob = 512*2 - n = n_glob/nproc - n_block = 512*512/SZ - n_iters = 100 + n_glob = 512*2 + n = n_glob/nproc + n_block = 512*512/SZ + n_iters = 100 - allocate(u(SZ, n, n_block), du(SZ, n, n_block)) - allocate(u_dev(SZ, n, n_block), du_dev(SZ, n, n_block)) + allocate(u(SZ, n, n_block), du(SZ, n, n_block)) + allocate(u_dev(SZ, n, n_block), du_dev(SZ, n, n_block)) - dx_per = 2*pi/n_glob - dx = 2*pi/(n_glob - 1) + dx_per = 2*pi/n_glob + dx = 2*pi/(n_glob - 1) - do k = 1, n_block + do k = 1, n_block do j = 1, n - do i = 1, SZ - u(i, j, k) = sin((j - 1 + nrank*n)*dx_per) - end do + do i = 1, SZ + u(i, j, k) = sin((j - 1 + nrank*n)*dx_per) + end do end do - end do + end do - ! move data to device - u_dev = u + ! move data to device + u_dev = u - n_halo = 4 + n_halo = 4 - ! arrays for exchanging data between ranks - allocate(u_send_s_dev(SZ, n_halo, n_block)) - allocate(u_send_e_dev(SZ, n_halo, n_block)) - allocate(u_recv_s_dev(SZ, n_halo, n_block)) - allocate(u_recv_e_dev(SZ, n_halo, n_block)) + ! arrays for exchanging data between ranks + allocate(u_send_s_dev(SZ, n_halo, n_block)) + allocate(u_send_e_dev(SZ, n_halo, n_block)) + allocate(u_recv_s_dev(SZ, n_halo, n_block)) + allocate(u_recv_e_dev(SZ, n_halo, n_block)) - allocate(du_send_s_dev(SZ, 1, n_block), du_send_e_dev(SZ, 1, n_block)) - allocate(du_recv_s_dev(SZ, 1, n_block), du_recv_e_dev(SZ, 1, n_block)) + allocate(du_send_s_dev(SZ, 1, n_block), du_send_e_dev(SZ, 1, n_block)) + allocate(du_recv_s_dev(SZ, 1, n_block), du_recv_e_dev(SZ, 1, n_block)) - ! preprocess the operator and coefficient arrays - tdsops = cuda_tdsops_init(n, dx_per, operation='second-deriv', & - scheme='compact6') + ! preprocess the operator and coefficient arrays + tdsops = cuda_tdsops_init(n, dx_per, operation='second-deriv', & + scheme='compact6') - blocks = dim3(n_block, 1, 1) - threads = dim3(SZ, 1, 1) + blocks = dim3(n_block, 1, 1) + threads = dim3(SZ, 1, 1) - call cpu_time(tstart) - do i = 1, n_iters + call cpu_time(tstart) + do i = 1, n_iters u_send_s_dev(:, :, :) = u_dev(:, 1:4, :) u_send_e_dev(:, :, :) = u_dev(:, n - n_halo + 1:n, :) ! halo exchange call sendrecv_fields(u_recv_s_dev, u_recv_e_dev, & - u_send_s_dev, u_send_e_dev, & - SZ*4*n_block, nproc, pprev, pnext) + u_send_s_dev, u_send_e_dev, & + SZ*4*n_block, nproc, pprev, pnext) call exec_dist_tds_compact( & - du_dev, u_dev, u_recv_s_dev, u_recv_e_dev, & - du_send_s_dev, du_send_e_dev, du_recv_s_dev, du_recv_e_dev, & - tdsops, nproc, pprev, pnext, blocks, threads & - ) - end do - - call cpu_time(tend) - if (nrank == 0) print*, 'Total time', tend - tstart - - ! BW utilisation and performance checks - ! 4 in the first phase, 2 in the second phase, 6 in total - achievedBW = 6._dp*n_iters*n*n_block*SZ*dp/(tend - tstart) - call MPI_Allreduce(achievedBW, achievedBWmax, 1, MPI_DOUBLE_PRECISION, & - MPI_MAX, MPI_COMM_WORLD, ierr) - call MPI_Allreduce(achievedBW, achievedBWmin, 1, MPI_DOUBLE_PRECISION, & - MPI_MIN, MPI_COMM_WORLD, ierr) - - if (nrank == 0) then + du_dev, u_dev, u_recv_s_dev, u_recv_e_dev, & + du_send_s_dev, du_send_e_dev, du_recv_s_dev, du_recv_e_dev, & + tdsops, nproc, pprev, pnext, blocks, threads & + ) + end do + + call cpu_time(tend) + if (nrank == 0) print*, 'Total time', tend - tstart + + ! BW utilisation and performance checks + ! 4 in the first phase, 2 in the second phase, 6 in total + achievedBW = 6._dp*n_iters*n*n_block*SZ*dp/(tend - tstart) + call MPI_Allreduce(achievedBW, achievedBWmax, 1, MPI_DOUBLE_PRECISION, & + MPI_MAX, MPI_COMM_WORLD, ierr) + call MPI_Allreduce(achievedBW, achievedBWmin, 1, MPI_DOUBLE_PRECISION, & + MPI_MIN, MPI_COMM_WORLD, ierr) + + if (nrank == 0) then print'(a, f8.3, a)', 'Achieved BW min: ', achievedBWmin/2**30, ' GiB/s' print'(a, f8.3, a)', 'Achieved BW max: ', achievedBWmax/2**30, ' GiB/s' - end if + end if - ierr = cudaDeviceGetAttribute(memClockRt, cudaDevAttrMemoryClockRate, 0) - ierr = cudaDeviceGetAttribute(memBusWidth, & - cudaDevAttrGlobalMemoryBusWidth, 0) - deviceBW = 2*memBusWidth/8._dp*memClockRt*1000 + ierr = cudaDeviceGetAttribute(memClockRt, cudaDevAttrMemoryClockRate, 0) + ierr = cudaDeviceGetAttribute(memBusWidth, & + cudaDevAttrGlobalMemoryBusWidth, 0) + deviceBW = 2*memBusWidth/8._dp*memClockRt*1000 - if (nrank == 0) then + if (nrank == 0) then print'(a, f8.3, a)', 'Device BW: ', deviceBW/2**30, ' GiB/s' print'(a, f5.2)', 'Effective BW util min: %', achievedBWmin/deviceBW*100 print'(a, f5.2)', 'Effective BW util max: %', achievedBWmax/deviceBW*100 - end if + end if - ! check error - du = du_dev - norm_du = norm2(u + du) - norm_du = norm_du*norm_du/n_glob/n_block/SZ - call MPI_Allreduce(MPI_IN_PLACE, norm_du, 1, MPI_DOUBLE_PRECISION, & - MPI_SUM, MPI_COMM_WORLD, ierr) - norm_du = sqrt(norm_du) + ! check error + du = du_dev + norm_du = norm2(u + du) + norm_du = norm_du*norm_du/n_glob/n_block/SZ + call MPI_Allreduce(MPI_IN_PLACE, norm_du, 1, MPI_DOUBLE_PRECISION, & + MPI_SUM, MPI_COMM_WORLD, ierr) + norm_du = sqrt(norm_du) - if (nrank == 0) print*, 'error norm', norm_du + if (nrank == 0) print*, 'error norm', norm_du - if (nrank == 0) then + if (nrank == 0) then if ( norm_du > tol ) then - allpass = .false. - write(stderr, '(a)') 'Check second derivatives... failed' + allpass = .false. + write(stderr, '(a)') 'Check second derivatives... failed' else - write(stderr, '(a)') 'Check second derivatives... passed' + write(stderr, '(a)') 'Check second derivatives... passed' end if - end if + end if - if (allpass) then + if (allpass) then if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' - else + else error stop 'SOME TESTS FAILED.' - end if + end if - call MPI_Finalize(ierr) + call MPI_Finalize(ierr) -end program test_cuda_tridiag + end program test_cuda_tridiag diff --git a/tests/omp/test_omp_dist_transeq.f90 b/tests/omp/test_omp_dist_transeq.f90 index 1a46df80..a5185de6 100644 --- a/tests/omp/test_omp_dist_transeq.f90 +++ b/tests/omp/test_omp_dist_transeq.f90 @@ -1,115 +1,115 @@ -program test_transeq - use iso_fortran_env, only: stderr => error_unit - use mpi - - use m_common, only: dp, pi - use m_omp_common, only: SZ - use m_omp_exec_dist, only: exec_dist_transeq_compact - use m_omp_sendrecv, only: sendrecv_fields - use m_tdsops, only: tdsops_t - - implicit none - - logical :: allpass = .true. - real(dp), allocatable, dimension(:, :, :) :: u, v, r_u - real(dp), allocatable, dimension(:, :, :) :: du, dud, d2u ! intermediate solution arrays - real(dp), allocatable, dimension(:, :, :) :: & + program test_transeq + use iso_fortran_env, only: stderr => error_unit + use mpi + + use m_common, only: dp, pi + use m_omp_common, only: SZ + use m_omp_exec_dist, only: exec_dist_transeq_compact + use m_omp_sendrecv, only: sendrecv_fields + use m_tdsops, only: tdsops_t + + implicit none + + logical :: allpass = .true. + real(dp), allocatable, dimension(:, :, :) :: u, v, r_u + real(dp), allocatable, dimension(:, :, :) :: du, dud, d2u ! intermediate solution arrays + real(dp), allocatable, dimension(:, :, :) :: & du_recv_s, du_recv_e, du_send_s, du_send_e, & dud_recv_s, dud_recv_e, dud_send_s, dud_send_e, & d2u_recv_s, d2u_recv_e, d2u_send_s, d2u_send_e - real(dp), allocatable, dimension(:, :, :) :: & + real(dp), allocatable, dimension(:, :, :) :: & u_send_s, u_send_e, u_recv_s, u_recv_e, & v_send_s, v_send_e, v_recv_s, v_recv_e - type(tdsops_t) :: der1st, der2nd + type(tdsops_t) :: der1st, der2nd - integer :: n, n_block, i, j, k, n_halo, n_iters - integer :: n_glob - integer :: nrank, nproc, pprev, pnext - integer :: ierr + integer :: n, n_block, i, j, k, n_halo, n_iters + integer :: n_glob + integer :: nrank, nproc, pprev, pnext + integer :: ierr - real(dp) :: dx, dx_per, nu, norm_du, tol = 1d-8 + real(dp) :: dx, dx_per, nu, norm_du, tol = 1d-8 - call MPI_Init(ierr) - call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) - if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' + if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' - pnext = modulo(nrank - nproc + 1, nproc) - pprev = modulo(nrank - 1, nproc) + pnext = modulo(nrank - nproc + 1, nproc) + pprev = modulo(nrank - 1, nproc) - n_glob = 32*4 - n = n_glob/nproc - n_block = 32*32/SZ - n_iters = 1 + n_glob = 32*4 + n = n_glob/nproc + n_block = 32*32/SZ + n_iters = 1 - nu = 1._dp + nu = 1._dp - allocate(u(SZ, n, n_block), v(SZ, n, n_block), r_u(SZ, n, n_block)) + allocate(u(SZ, n, n_block), v(SZ, n, n_block), r_u(SZ, n, n_block)) - ! main input fields - ! field for storing the result - ! intermediate solution fields - allocate(du(SZ, n, n_block)) - allocate(dud(SZ, n, n_block)) - allocate(d2u(SZ, n, n_block)) + ! main input fields + ! field for storing the result + ! intermediate solution fields + allocate(du(SZ, n, n_block)) + allocate(dud(SZ, n, n_block)) + allocate(d2u(SZ, n, n_block)) - dx_per = 2*pi/n_glob - dx = 2*pi/(n_glob - 1) + dx_per = 2*pi/n_glob + dx = 2*pi/(n_glob - 1) - do k = 1, n_block + do k = 1, n_block do j = 1, n - do i = 1, SZ - u(i, j, k) = sin((j - 1 + nrank*n)*dx_per) - v(i, j, k) = cos((j - 1 + nrank*n)*dx_per) - end do + do i = 1, SZ + u(i, j, k) = sin((j - 1 + nrank*n)*dx_per) + v(i, j, k) = cos((j - 1 + nrank*n)*dx_per) + end do end do - end do - - n_halo = 4 - - ! arrays for exchanging data between ranks - allocate(u_send_s(SZ, n_halo, n_block)) - allocate(u_send_e(SZ, n_halo, n_block)) - allocate(u_recv_s(SZ, n_halo, n_block)) - allocate(u_recv_e(SZ, n_halo, n_block)) - allocate(v_send_s(SZ, n_halo, n_block)) - allocate(v_send_e(SZ, n_halo, n_block)) - allocate(v_recv_s(SZ, n_halo, n_block)) - allocate(v_recv_e(SZ, n_halo, n_block)) - - allocate(du_send_s(SZ, 1, n_block), du_send_e(SZ, 1, n_block)) - allocate(du_recv_s(SZ, 1, n_block), du_recv_e(SZ, 1, n_block)) - allocate(dud_send_s(SZ, 1, n_block), dud_send_e(SZ, 1, n_block)) - allocate(dud_recv_s(SZ, 1, n_block), dud_recv_e(SZ, 1, n_block)) - allocate(d2u_send_s(SZ, 1, n_block), d2u_send_e(SZ, 1, n_block)) - allocate(d2u_recv_s(SZ, 1, n_block), d2u_recv_e(SZ, 1, n_block)) - - ! preprocess the operator and coefficient arrays - der1st = tdsops_t(n, dx_per, operation='first-deriv', & - scheme='compact6') - der2nd = tdsops_t(n, dx_per, operation='second-deriv', & - scheme='compact6') - - - u_send_s(:, :, :) = u(:, 1:4, :) - u_send_e(:, :, :) = u(:, n - n_halo + 1:n, :) - v_send_s(:, :, :) = v(:, 1:4, :) - v_send_e(:, :, :) = v(:, n - n_halo + 1:n, :) - - - ! halo exchange - call sendrecv_fields(u_recv_s, u_recv_e, & - u_send_s, u_send_e, & - SZ*4*n_block, nproc, pprev, pnext) - - call sendrecv_fields(v_recv_s, v_recv_e, & - v_send_s, v_send_e, & - SZ*4*n_block, nproc, pprev, pnext) - - call exec_dist_transeq_compact( & + end do + + n_halo = 4 + + ! arrays for exchanging data between ranks + allocate(u_send_s(SZ, n_halo, n_block)) + allocate(u_send_e(SZ, n_halo, n_block)) + allocate(u_recv_s(SZ, n_halo, n_block)) + allocate(u_recv_e(SZ, n_halo, n_block)) + allocate(v_send_s(SZ, n_halo, n_block)) + allocate(v_send_e(SZ, n_halo, n_block)) + allocate(v_recv_s(SZ, n_halo, n_block)) + allocate(v_recv_e(SZ, n_halo, n_block)) + + allocate(du_send_s(SZ, 1, n_block), du_send_e(SZ, 1, n_block)) + allocate(du_recv_s(SZ, 1, n_block), du_recv_e(SZ, 1, n_block)) + allocate(dud_send_s(SZ, 1, n_block), dud_send_e(SZ, 1, n_block)) + allocate(dud_recv_s(SZ, 1, n_block), dud_recv_e(SZ, 1, n_block)) + allocate(d2u_send_s(SZ, 1, n_block), d2u_send_e(SZ, 1, n_block)) + allocate(d2u_recv_s(SZ, 1, n_block), d2u_recv_e(SZ, 1, n_block)) + + ! preprocess the operator and coefficient arrays + der1st = tdsops_t(n, dx_per, operation='first-deriv', & + scheme='compact6') + der2nd = tdsops_t(n, dx_per, operation='second-deriv', & + scheme='compact6') + + + u_send_s(:, :, :) = u(:, 1:4, :) + u_send_e(:, :, :) = u(:, n - n_halo + 1:n, :) + v_send_s(:, :, :) = v(:, 1:4, :) + v_send_e(:, :, :) = v(:, n - n_halo + 1:n, :) + + + ! halo exchange + call sendrecv_fields(u_recv_s, u_recv_e, & + u_send_s, u_send_e, & + SZ*4*n_block, nproc, pprev, pnext) + + call sendrecv_fields(v_recv_s, v_recv_e, & + v_send_s, v_send_e, & + SZ*4*n_block, nproc, pprev, pnext) + + call exec_dist_transeq_compact( & r_u, & du, dud, d2u, & du_send_s, du_send_e, du_recv_s, du_recv_e, & @@ -118,34 +118,34 @@ program test_transeq u, u_recv_s, u_recv_e, & v, v_recv_s, v_recv_e, & der1st, der1st, der2nd, nu, nproc, pprev, pnext, n_block & - ) + ) - ! check error - r_u = r_u - (-v*v + 0.5_dp*u*u - nu*u) - norm_du = norm2(r_u) - norm_du = norm_du*norm_du/n_glob/n_block/SZ - call MPI_Allreduce(MPI_IN_PLACE, norm_du, 1, MPI_DOUBLE_PRECISION, & - MPI_SUM, MPI_COMM_WORLD, ierr) - norm_du = sqrt(norm_du) + ! check error + r_u = r_u - (-v*v + 0.5_dp*u*u - nu*u) + norm_du = norm2(r_u) + norm_du = norm_du*norm_du/n_glob/n_block/SZ + call MPI_Allreduce(MPI_IN_PLACE, norm_du, 1, MPI_DOUBLE_PRECISION, & + MPI_SUM, MPI_COMM_WORLD, ierr) + norm_du = sqrt(norm_du) - if (nrank == 0) print*, 'error norm', norm_du + if (nrank == 0) print*, 'error norm', norm_du - if (nrank == 0) then + if (nrank == 0) then if ( norm_du > tol ) then - allpass = .false. - write(stderr, '(a)') 'Check second derivatives... failed' + allpass = .false. + write(stderr, '(a)') 'Check second derivatives... failed' else - write(stderr, '(a)') 'Check second derivatives... passed' + write(stderr, '(a)') 'Check second derivatives... passed' end if - end if + end if - if (allpass) then + if (allpass) then if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' - else + else error stop 'SOME TESTS FAILED.' - end if + end if - call MPI_Finalize(ierr) + call MPI_Finalize(ierr) -end program test_transeq + end program test_transeq diff --git a/tests/omp/test_omp_transeq.f90 b/tests/omp/test_omp_transeq.f90 index f385fd64..c75ad6ac 100644 --- a/tests/omp/test_omp_transeq.f90 +++ b/tests/omp/test_omp_transeq.f90 @@ -1,158 +1,158 @@ -program test_omp_transeq - use iso_fortran_env, only: stderr => error_unit - use mpi - - use m_allocator, only: allocator_t, field_t - use m_common, only: dp, pi, globs_t, set_pprev_pnext, DIR_X, DIR_Y, DIR_Z - use m_omp_common, only: SZ - use m_omp_sendrecv, only: sendrecv_fields - use m_omp_backend, only: omp_backend_t, transeq_x_omp, base_backend_t - use m_tdsops, only: dirps_t, tdsops_t - use m_solver, only: allocate_tdsops - - implicit none - - logical :: allpass = .true. - class(field_t), pointer :: u, v, w - class(field_t), pointer :: du, dv, dw - real(dp), dimension(:, :, :), allocatable :: r_u - - integer :: n, n_block, i, j, k - integer :: n_glob - integer :: nrank, nproc - integer :: ierr - - real(dp) :: dx, dx_per, nu, norm_du, tol = 1d-8, tstart, tend - - type(globs_t) :: globs - class(base_backend_t), pointer :: backend - class(allocator_t), pointer :: allocator - - type(omp_backend_t), target :: omp_backend - type(allocator_t), target :: omp_allocator - type(dirps_t) :: xdirps, ydirps, zdirps - - ! Initialise variables and arrays - call MPI_Init(ierr) - call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) - - globs%nx = 96 - globs%ny = 96 - globs%nz = 96 - - globs%nx_loc = globs%nx/nproc - globs%ny_loc = globs%ny/nproc - globs%nz_loc = globs%nz/nproc - - globs%n_groups_x = globs%ny_loc*globs%nz_loc/SZ - globs%n_groups_y = globs%nx_loc*globs%nz_loc/SZ - globs%n_groups_z = globs%nx_loc*globs%ny_loc/SZ - - xdirps%nproc = nproc - ydirps%nproc = nproc - zdirps%nproc = nproc - - call set_pprev_pnext( & + program test_omp_transeq + use iso_fortran_env, only: stderr => error_unit + use mpi + + use m_allocator, only: allocator_t, field_t + use m_common, only: dp, pi, globs_t, set_pprev_pnext, DIR_X, DIR_Y, DIR_Z + use m_omp_common, only: SZ + use m_omp_sendrecv, only: sendrecv_fields + use m_omp_backend, only: omp_backend_t, transeq_x_omp, base_backend_t + use m_tdsops, only: dirps_t, tdsops_t + use m_solver, only: allocate_tdsops + + implicit none + + logical :: allpass = .true. + class(field_t), pointer :: u, v, w + class(field_t), pointer :: du, dv, dw + real(dp), dimension(:, :, :), allocatable :: r_u + + integer :: n, n_block, i, j, k + integer :: n_glob + integer :: nrank, nproc + integer :: ierr + + real(dp) :: dx, dx_per, nu, norm_du, tol = 1d-8, tstart, tend + + type(globs_t) :: globs + class(base_backend_t), pointer :: backend + class(allocator_t), pointer :: allocator + + type(omp_backend_t), target :: omp_backend + type(allocator_t), target :: omp_allocator + type(dirps_t) :: xdirps, ydirps, zdirps + + ! Initialise variables and arrays + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) + + globs%nx = 96 + globs%ny = 96 + globs%nz = 96 + + globs%nx_loc = globs%nx/nproc + globs%ny_loc = globs%ny/nproc + globs%nz_loc = globs%nz/nproc + + globs%n_groups_x = globs%ny_loc*globs%nz_loc/SZ + globs%n_groups_y = globs%nx_loc*globs%nz_loc/SZ + globs%n_groups_z = globs%nx_loc*globs%ny_loc/SZ + + xdirps%nproc = nproc + ydirps%nproc = nproc + zdirps%nproc = nproc + + call set_pprev_pnext( & xdirps%pprev, xdirps%pnext, & ydirps%pprev, ydirps%pnext, & zdirps%pprev, zdirps%pnext, & xdirps%nproc, ydirps%nproc, zdirps%nproc, nrank & - ) - - xdirps%n = globs%nx_loc - ydirps%n = globs%ny_loc - zdirps%n = globs%nz_loc + ) - xdirps%n_blocks = globs%n_groups_x - ydirps%n_blocks = globs%n_groups_y - zdirps%n_blocks = globs%n_groups_z + xdirps%n = globs%nx_loc + ydirps%n = globs%ny_loc + zdirps%n = globs%nz_loc - xdirps%dir = DIR_X - ydirps%dir = DIR_Y - zdirps%dir = DIR_Z + xdirps%n_blocks = globs%n_groups_x + ydirps%n_blocks = globs%n_groups_y + zdirps%n_blocks = globs%n_groups_z - omp_allocator = allocator_t(xdirps%n, ydirps%n, zdirps%n, SZ) - allocator => omp_allocator - print*, 'OpenMP allocator instantiated' + xdirps%dir = DIR_X + ydirps%dir = DIR_Y + zdirps%dir = DIR_Z - omp_backend = omp_backend_t(globs, allocator) - backend => omp_backend - print*, 'OpenMP backend instantiated' + omp_allocator = allocator_t(xdirps%n, ydirps%n, zdirps%n, SZ) + allocator => omp_allocator + print*, 'OpenMP allocator instantiated' + omp_backend = omp_backend_t(globs, allocator) + backend => omp_backend + print*, 'OpenMP backend instantiated' - if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' - n_glob = globs%nx - n = n_glob/nproc - n_block = xdirps%n_blocks + if (nrank == 0) print*, 'Parallel run with', nproc, 'ranks' - nu = 1._dp - omp_backend%nu = nu + n_glob = globs%nx + n = n_glob/nproc + n_block = xdirps%n_blocks - - u => allocator%get_block(DIR_X) - v => allocator%get_block(DIR_X) - w => allocator%get_block(DIR_X) + nu = 1._dp + omp_backend%nu = nu - du => allocator%get_block(DIR_X) - dv => allocator%get_block(DIR_X) - dw => allocator%get_block(DIR_X) - dx_per = 2*pi/n_glob - dx = 2*pi/(n_glob - 1) - globs%dx = dx + u => allocator%get_block(DIR_X) + v => allocator%get_block(DIR_X) + w => allocator%get_block(DIR_X) + du => allocator%get_block(DIR_X) + dv => allocator%get_block(DIR_X) + dw => allocator%get_block(DIR_X) - do k = 1, n_block + dx_per = 2*pi/n_glob + dx = 2*pi/(n_glob - 1) + globs%dx = dx + + + do k = 1, n_block do j = 1, n - do i = 1, SZ - u%data(i, j, k) = sin((j - 1 + nrank*n)*dx_per) - v%data(i, j, k) = cos((j - 1 + nrank*n)*dx_per) - end do + do i = 1, SZ + u%data(i, j, k) = sin((j - 1 + nrank*n)*dx_per) + v%data(i, j, k) = cos((j - 1 + nrank*n)*dx_per) + end do end do - end do - w%data(:, :, :) = 0.d0 + end do + w%data(:, :, :) = 0.d0 - call allocate_tdsops(xdirps, globs%nx_loc, dx_per, omp_backend) + call allocate_tdsops(xdirps, globs%nx_loc, dx_per, omp_backend) - call cpu_time(tstart) - call transeq_x_omp(omp_backend, du, dv, dw, u, v, w, xdirps) - call cpu_time(tend) + call cpu_time(tstart) + call transeq_x_omp(omp_backend, du, dv, dw, u, v, w, xdirps) + call cpu_time(tend) - if (nrank == 0) print*, 'Total time', tend - tstart + if (nrank == 0) print*, 'Total time', tend - tstart - allocate(r_u(SZ, n, n_block)) + allocate(r_u(SZ, n, n_block)) - ! check error - ! dv = -1/2*(u*dv/dx + d(u*v)/dx) + nu*d2v/dx2 - ! u is sin, v is cos; - ! dv = -1/2*(u*(-u) + v*v + u*(-u)) + nu*(-v) - ! = u*u - 1/2*v*v - nu*v - r_u = dv%data - (u%data*u%data - 0.5_dp*v%data*v%data - nu*v%data) - norm_du = norm2(r_u) - norm_du = norm_du*norm_du/n_glob/n_block/SZ - call MPI_Allreduce(MPI_IN_PLACE, norm_du, 1, MPI_DOUBLE_PRECISION, & - MPI_SUM, MPI_COMM_WORLD, ierr) - norm_du = sqrt(norm_du) + ! check error + ! dv = -1/2*(u*dv/dx + d(u*v)/dx) + nu*d2v/dx2 + ! u is sin, v is cos; + ! dv = -1/2*(u*(-u) + v*v + u*(-u)) + nu*(-v) + ! = u*u - 1/2*v*v - nu*v + r_u = dv%data - (u%data*u%data - 0.5_dp*v%data*v%data - nu*v%data) + norm_du = norm2(r_u) + norm_du = norm_du*norm_du/n_glob/n_block/SZ + call MPI_Allreduce(MPI_IN_PLACE, norm_du, 1, MPI_DOUBLE_PRECISION, & + MPI_SUM, MPI_COMM_WORLD, ierr) + norm_du = sqrt(norm_du) - if (nrank == 0) print*, 'error norm', norm_du - if (nrank == 0) then + if (nrank == 0) print*, 'error norm', norm_du + if (nrank == 0) then if ( norm_du > tol ) then - allpass = .false. - write(stderr, '(a)') 'Check second derivatives... failed' + allpass = .false. + write(stderr, '(a)') 'Check second derivatives... failed' else - write(stderr, '(a)') 'Check second derivatives... passed' + write(stderr, '(a)') 'Check second derivatives... passed' end if - end if + end if - if (allpass) then + if (allpass) then if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' - else + else error stop 'SOME TESTS FAILED.' - end if + end if - call MPI_Finalize(ierr) + call MPI_Finalize(ierr) -end program test_omp_transeq + end program test_omp_transeq diff --git a/tests/omp/test_omp_tridiag.f90 b/tests/omp/test_omp_tridiag.f90 index 41b706e9..0e86cc7c 100644 --- a/tests/omp/test_omp_tridiag.f90 +++ b/tests/omp/test_omp_tridiag.f90 @@ -1,268 +1,268 @@ -program test_omp_tridiag - use iso_fortran_env, only: stderr => error_unit - use mpi - use omp_lib + program test_omp_tridiag + use iso_fortran_env, only: stderr => error_unit + use mpi + use omp_lib - use m_common, only: dp, pi - use m_omp_common, only: SZ - use m_omp_sendrecv, only: sendrecv_fields - use m_omp_exec_dist, only: exec_dist_tds_compact + use m_common, only: dp, pi + use m_omp_common, only: SZ + use m_omp_sendrecv, only: sendrecv_fields + use m_omp_exec_dist, only: exec_dist_tds_compact - use m_tdsops, only: tdsops_t, tdsops_init + use m_tdsops, only: tdsops_t, tdsops_init - implicit none + implicit none - logical :: allpass = .true. + logical :: allpass = .true. - real(dp), allocatable, dimension(:, :, :) :: u, du - real(dp), allocatable, dimension(:, :, :) :: u_recv_s, u_recv_e, & - u_send_s, u_send_e + real(dp), allocatable, dimension(:, :, :) :: u, du + real(dp), allocatable, dimension(:, :, :) :: u_recv_s, u_recv_e, & + u_send_s, u_send_e - real(dp), allocatable, dimension(:, :, :) :: send_s, send_e, & - recv_s, recv_e + real(dp), allocatable, dimension(:, :, :) :: send_s, send_e, & + recv_s, recv_e - real(dp), allocatable, dimension(:) :: sin_0_2pi_per, cos_0_2pi_per, & - sin_0_2pi, cos_0_2pi, & - sin_stag, cos_stag + real(dp), allocatable, dimension(:) :: sin_0_2pi_per, cos_0_2pi_per, & + sin_0_2pi, cos_0_2pi, & + sin_stag, cos_stag - type(tdsops_t) :: tdsops + type(tdsops_t) :: tdsops - character(len=20) :: bc_start, bc_end + character(len=20) :: bc_start, bc_end - integer :: n, n_block, j, n_halo, n_iters, n_loc - integer :: n_glob - integer :: nrank, nproc, pprev, pnext - integer :: ierr, memClockRt, memBusWidth + integer :: n, n_block, j, n_halo, n_iters, n_loc + integer :: n_glob + integer :: nrank, nproc, pprev, pnext + integer :: ierr, memClockRt, memBusWidth - real(dp) :: dx, dx_per, norm_du, tol = 1d-8, tstart, tend - real(dp) :: achievedBW, deviceBW, achievedBWmax, achievedBWmin + real(dp) :: dx, dx_per, norm_du, tol = 1d-8, tstart, tend + real(dp) :: achievedBW, deviceBW, achievedBWmax, achievedBWmin - call MPI_Init(ierr) - call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) - call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) + call MPI_Init(ierr) + call MPI_Comm_rank(MPI_COMM_WORLD, nrank, ierr) + call MPI_Comm_size(MPI_COMM_WORLD, nproc, ierr) - if (nrank == 0) print *, 'Parallel run with', nproc, 'ranks' + if (nrank == 0) print *, 'Parallel run with', nproc, 'ranks' - pnext = modulo(nrank - nproc + 1, nproc) - pprev = modulo(nrank - 1, nproc) + pnext = modulo(nrank - nproc + 1, nproc) + pprev = modulo(nrank - 1, nproc) - n_glob = 1024 - n = n_glob/nproc - n_block = 64*64/SZ - n_iters = 1 + n_glob = 1024 + n = n_glob/nproc + n_block = 64*64/SZ + n_iters = 1 - allocate (u(SZ, n, n_block), du(SZ, n, n_block)) + allocate (u(SZ, n, n_block), du(SZ, n, n_block)) - dx_per = 2*pi/n_glob - dx = 2*pi/(n_glob - 1) + dx_per = 2*pi/n_glob + dx = 2*pi/(n_glob - 1) - allocate (sin_0_2pi_per(n), cos_0_2pi_per(n)) - allocate (sin_0_2pi(n), cos_0_2pi(n)) - allocate (sin_stag(n), cos_stag(n)) - do j = 1, n + allocate (sin_0_2pi_per(n), cos_0_2pi_per(n)) + allocate (sin_0_2pi(n), cos_0_2pi(n)) + allocate (sin_stag(n), cos_stag(n)) + do j = 1, n sin_0_2pi_per(j) = sin(((j - 1) + nrank*n)*dx_per) cos_0_2pi_per(j) = cos(((j - 1) + nrank*n)*dx_per) sin_0_2pi(j) = sin(((j - 1) + nrank*n)*dx) cos_0_2pi(j) = cos(((j - 1) + nrank*n)*dx) sin_stag(j) = sin(((j - 1) + nrank*n)*dx + dx/2._dp) cos_stag(j) = cos(((j - 1) + nrank*n)*dx + dx/2._dp) - end do + end do - n_halo = 4 + n_halo = 4 - ! arrays for exchanging data between ranks - allocate (u_send_s(SZ, n_halo, n_block)) - allocate (u_send_e(SZ, n_halo, n_block)) - allocate (u_recv_s(SZ, n_halo, n_block)) - allocate (u_recv_e(SZ, n_halo, n_block)) + ! arrays for exchanging data between ranks + allocate (u_send_s(SZ, n_halo, n_block)) + allocate (u_send_e(SZ, n_halo, n_block)) + allocate (u_recv_s(SZ, n_halo, n_block)) + allocate (u_recv_e(SZ, n_halo, n_block)) - allocate (send_s(SZ, 1, n_block), send_e(SZ, 1, n_block)) - allocate (recv_s(SZ, 1, n_block), recv_e(SZ, 1, n_block)) + allocate (send_s(SZ, 1, n_block), send_e(SZ, 1, n_block)) + allocate (recv_s(SZ, 1, n_block), recv_e(SZ, 1, n_block)) - ! ========================================================================= - ! second derivative with periodic BC - tdsops = tdsops_init(n, dx_per, operation='second-deriv', scheme='compact6') + ! ========================================================================= + ! second derivative with periodic BC + tdsops = tdsops_init(n, dx_per, operation='second-deriv', scheme='compact6') - call set_u(u, sin_0_2pi_per, n, n_block) + call set_u(u, sin_0_2pi_per, n, n_block) - tstart = omp_get_wtime() + tstart = omp_get_wtime() - call run_kernel(n_iters, n_block, u, du, tdsops, n, & - u_recv_s, u_recv_e, u_send_s, u_send_e, & - recv_s, recv_e, send_s, send_e, & - nproc, pprev, pnext & - ) + call run_kernel(n_iters, n_block, u, du, tdsops, n, & + u_recv_s, u_recv_e, u_send_s, u_send_e, & + recv_s, recv_e, send_s, send_e, & + nproc, pprev, pnext & + ) - tend = omp_get_wtime() - if (nrank == 0) print *, 'Total time', tend - tstart + tend = omp_get_wtime() + if (nrank == 0) print *, 'Total time', tend - tstart - call check_error_norm(du, sin_0_2pi_per, n, n_glob, n_block, 1, norm_du) - if (nrank == 0) print *, 'error norm second-deriv periodic', norm_du + call check_error_norm(du, sin_0_2pi_per, n, n_glob, n_block, 1, norm_du) + if (nrank == 0) print *, 'error norm second-deriv periodic', norm_du - if (nrank == 0) then + if (nrank == 0) then if (norm_du > tol) then - allpass = .false. - write (stderr, '(a)') 'Check 2nd derivatives, periodic BCs... failed' + allpass = .false. + write (stderr, '(a)') 'Check 2nd derivatives, periodic BCs... failed' else - write (stderr, '(a)') 'Check 2nd derivatives, periodic BCs... passed' + write (stderr, '(a)') 'Check 2nd derivatives, periodic BCs... passed' end if - end if + end if - ! ========================================================================= - ! first derivative with periodic BC - tdsops = tdsops_init(n, dx_per, operation='first-deriv', scheme='compact6') + ! ========================================================================= + ! first derivative with periodic BC + tdsops = tdsops_init(n, dx_per, operation='first-deriv', scheme='compact6') - call set_u(u, sin_0_2pi_per, n, n_block) + call set_u(u, sin_0_2pi_per, n, n_block) - call run_kernel(n_iters, n_block, u, du, tdsops, n, & - u_recv_s, u_recv_e, u_send_s, u_send_e, & - recv_s, recv_e, send_s, send_e, & - nproc, pprev, pnext & - ) + call run_kernel(n_iters, n_block, u, du, tdsops, n, & + u_recv_s, u_recv_e, u_send_s, u_send_e, & + recv_s, recv_e, send_s, send_e, & + nproc, pprev, pnext & + ) - call check_error_norm(du, cos_0_2pi_per, n, n_glob, n_block, -1, norm_du) - if (nrank == 0) print *, 'error norm first-deriv periodic', norm_du + call check_error_norm(du, cos_0_2pi_per, n, n_glob, n_block, -1, norm_du) + if (nrank == 0) print *, 'error norm first-deriv periodic', norm_du - if (nrank == 0) then + if (nrank == 0) then if (norm_du > tol) then - allpass = .false. - write (stderr, '(a)') 'Check 1st derivatives, periodic BCs... failed' + allpass = .false. + write (stderr, '(a)') 'Check 1st derivatives, periodic BCs... failed' else - write (stderr, '(a)') 'Check 1st derivatives, periodic BCs... passed' + write (stderr, '(a)') 'Check 1st derivatives, periodic BCs... passed' end if - end if + end if - ! ========================================================================= - ! first derivative with dirichlet and neumann - if (nrank == 0) then + ! ========================================================================= + ! first derivative with dirichlet and neumann + if (nrank == 0) then bc_start = 'dirichlet'!'neumann'!'dirichlet' - else + else bc_start = 'null' - end if - if (nrank == nproc - 1) then + end if + if (nrank == nproc - 1) then bc_end = 'neumann'!'dirichlet'!'neumann' - else + else bc_end = 'null' - end if + end if - tdsops = tdsops_init(n, dx, operation='first-deriv', scheme='compact6', & - bc_start=trim(bc_start), bc_end=trim(bc_end), & - sym=.false.) + tdsops = tdsops_init(n, dx, operation='first-deriv', scheme='compact6', & + bc_start=trim(bc_start), bc_end=trim(bc_end), & + sym=.false.) - call set_u(u, sin_0_2pi, n, n_block) + call set_u(u, sin_0_2pi, n, n_block) - call run_kernel(n_iters, n_block, u, du, tdsops, n, & - u_recv_s, u_recv_e, u_send_s, u_send_e, & - recv_s, recv_e, send_s, send_e, & - nproc, pprev, pnext & - ) + call run_kernel(n_iters, n_block, u, du, tdsops, n, & + u_recv_s, u_recv_e, u_send_s, u_send_e, & + recv_s, recv_e, send_s, send_e, & + nproc, pprev, pnext & + ) - call check_error_norm(du, cos_0_2pi, n, n_glob, n_block, -1, norm_du) - if (nrank == 0) print *, 'error norm first deriv dir-neu', norm_du + call check_error_norm(du, cos_0_2pi, n, n_glob, n_block, -1, norm_du) + if (nrank == 0) print *, 'error norm first deriv dir-neu', norm_du - if (nrank == 0) then + if (nrank == 0) then if (norm_du > tol) then - allpass = .false. - write (stderr, '(a)') 'Check 1st derivatives, dir-neu... failed' + allpass = .false. + write (stderr, '(a)') 'Check 1st derivatives, dir-neu... failed' else - write (stderr, '(a)') 'Check 1st derivatives, dir-neu... passed' + write (stderr, '(a)') 'Check 1st derivatives, dir-neu... passed' end if - end if + end if - ! ========================================================================= - ! stag interpolate with neumann sym - n_loc = n - if (nrank == nproc - 1) n_loc = n - 1 - tdsops = tdsops_init(n_loc, dx, operation='interpolate', scheme='classic', & - bc_start=trim(bc_start), bc_end=trim(bc_end), & - from_to='v2p') + ! ========================================================================= + ! stag interpolate with neumann sym + n_loc = n + if (nrank == nproc - 1) n_loc = n - 1 + tdsops = tdsops_init(n_loc, dx, operation='interpolate', scheme='classic', & + bc_start=trim(bc_start), bc_end=trim(bc_end), & + from_to='v2p') - call set_u(u, cos_0_2pi, n, n_block) + call set_u(u, cos_0_2pi, n, n_block) - call run_kernel(n_iters, n_block, u, du, tdsops, n_loc, & - u_recv_s, u_recv_e, u_send_s, u_send_e, & - recv_s, recv_e, send_s, send_e, & - nproc, pprev, pnext & - ) + call run_kernel(n_iters, n_block, u, du, tdsops, n_loc, & + u_recv_s, u_recv_e, u_send_s, u_send_e, & + recv_s, recv_e, send_s, send_e, & + nproc, pprev, pnext & + ) - call check_error_norm(du, cos_stag, n_loc, n_glob, n_block, -1, norm_du) - if (nrank == 0) print *, 'error norm interpolate', norm_du + call check_error_norm(du, cos_stag, n_loc, n_glob, n_block, -1, norm_du) + if (nrank == 0) print *, 'error norm interpolate', norm_du - if (nrank == 0) then + if (nrank == 0) then if (norm_du > tol) then - allpass = .false. - write (stderr, '(a)') 'Check interpolation... failed' + allpass = .false. + write (stderr, '(a)') 'Check interpolation... failed' else - write (stderr, '(a)') 'Check interpolation... passed' + write (stderr, '(a)') 'Check interpolation... passed' end if - end if + end if - ! ========================================================================= - ! second derivative and hyperviscousity on with dirichlet and neumann - ! c_nu = 0.22 and nu0_nu = 63 results in alpha = 0.40869111947709036 - tdsops = tdsops_init(n, dx, operation='second-deriv', & - scheme='compact6-hyperviscous', & - bc_start=trim(bc_start), bc_end=trim(bc_end), & - sym=.false., c_nu=0.22_dp, nu0_nu=63._dp) + ! ========================================================================= + ! second derivative and hyperviscousity on with dirichlet and neumann + ! c_nu = 0.22 and nu0_nu = 63 results in alpha = 0.40869111947709036 + tdsops = tdsops_init(n, dx, operation='second-deriv', & + scheme='compact6-hyperviscous', & + bc_start=trim(bc_start), bc_end=trim(bc_end), & + sym=.false., c_nu=0.22_dp, nu0_nu=63._dp) - call set_u(u, sin_0_2pi, n, n_block) + call set_u(u, sin_0_2pi, n, n_block) - call run_kernel(n_iters, n_block, u, du, tdsops, n, & - u_recv_s, u_recv_e, u_send_s, u_send_e, & - recv_s, recv_e, send_s, send_e, & - nproc, pprev, pnext & - ) + call run_kernel(n_iters, n_block, u, du, tdsops, n, & + u_recv_s, u_recv_e, u_send_s, u_send_e, & + recv_s, recv_e, send_s, send_e, & + nproc, pprev, pnext & + ) - call check_error_norm(du, sin_0_2pi, n, n_glob, n_block, 1, norm_du) - if (nrank == 0) print *, 'error norm hyperviscous', norm_du + call check_error_norm(du, sin_0_2pi, n, n_glob, n_block, 1, norm_du) + if (nrank == 0) print *, 'error norm hyperviscous', norm_du - if (nrank == 0) then + if (nrank == 0) then if (norm_du > tol) then - allpass = .false. - write (stderr, '(a)') 'Check 2nd ders, hyperviscous, dir-neu... failed' + allpass = .false. + write (stderr, '(a)') 'Check 2nd ders, hyperviscous, dir-neu... failed' else - write (stderr, '(a)') 'Check 2nd ders, hyperviscous, dir-neu... passed' + write (stderr, '(a)') 'Check 2nd ders, hyperviscous, dir-neu... passed' end if - end if - - ! ========================================================================= - ! BW utilisation and performance checks - ! 3 in the first phase, 2 in the second phase, so 5 in total - achievedBW = 5._dp*n_iters*n*n_block*SZ*dp/(tend - tstart) - call MPI_Allreduce(achievedBW, achievedBWmax, 1, MPI_DOUBLE_PRECISION, & - MPI_MAX, MPI_COMM_WORLD, ierr) - call MPI_Allreduce(achievedBW, achievedBWmin, 1, MPI_DOUBLE_PRECISION, & - MPI_MIN, MPI_COMM_WORLD, ierr) - if (nrank == 0) then + end if + + ! ========================================================================= + ! BW utilisation and performance checks + ! 3 in the first phase, 2 in the second phase, so 5 in total + achievedBW = 5._dp*n_iters*n*n_block*SZ*dp/(tend - tstart) + call MPI_Allreduce(achievedBW, achievedBWmax, 1, MPI_DOUBLE_PRECISION, & + MPI_MAX, MPI_COMM_WORLD, ierr) + call MPI_Allreduce(achievedBW, achievedBWmin, 1, MPI_DOUBLE_PRECISION, & + MPI_MIN, MPI_COMM_WORLD, ierr) + if (nrank == 0) then print'(a, f8.3, a)', 'Achieved BW min: ', achievedBWmin/2**30, ' GiB/s' print'(a, f8.3, a)', 'Achieved BW max: ', achievedBWmax/2**30, ' GiB/s' - end if + end if - memClockRt = 3200 - memBusWidth = 64 - deviceBW = 2*memBusWidth/8._dp*memClockRt*1000000 + memClockRt = 3200 + memBusWidth = 64 + deviceBW = 2*memBusWidth/8._dp*memClockRt*1000000 - if (nrank == 0) then + if (nrank == 0) then print'(a, f8.3, a)', 'Available BW: ', deviceBW/2**30, & - ' GiB/s (per NUMA zone on ARCHER2)' + ' GiB/s (per NUMA zone on ARCHER2)' print'(a, f5.2)', 'Effective BW util min: %', achievedBWmin/deviceBW*100 print'(a, f5.2)', 'Effective BW util max: %', achievedBWmax/deviceBW*100 - end if + end if - if (allpass) then + if (allpass) then if (nrank == 0) write (stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' - else + else error stop 'SOME TESTS FAILED.' - end if + end if - call MPI_Finalize(ierr) + call MPI_Finalize(ierr) -contains + contains - subroutine run_kernel(n_iters, n_block, u, du, tdsops, n, & - u_recv_s, u_recv_e, u_send_s, u_send_e, & - recv_s, recv_e, send_s, send_e, & - nproc, pprev, pnext) + subroutine run_kernel(n_iters, n_block, u, du, tdsops, n, & + u_recv_s, u_recv_e, u_send_s, u_send_e, & + recv_s, recv_e, send_s, send_e, & + nproc, pprev, pnext) implicit none integer, intent(in) :: n_iters, n_block @@ -271,39 +271,39 @@ subroutine run_kernel(n_iters, n_block, u, du, tdsops, n, & type(tdsops_t), intent(in) :: tdsops integer, intent(in) :: n real(dp), intent(inout), dimension(:, :, :) :: u_recv_s, u_recv_e, & - u_send_s, u_send_e + u_send_s, u_send_e real(dp), intent(inout), dimension(:, :, :) :: recv_s, recv_e, & - send_s, send_e + send_s, send_e integer, intent(in) :: nproc, pprev, pnext integer :: iters, i, j, k do iters = 1, n_iters - ! first copy halo data into buffers - !$omp parallel do - do k = 1, n_block - do j = 1, 4 - !$omp simd - do i = 1, SZ - u_send_s(i, j, k) = u(i, j, k) - u_send_e(i, j, k) = u(i, n - n_halo + j, k) - end do - !$omp end simd + ! first copy halo data into buffers + !$omp parallel do + do k = 1, n_block + do j = 1, 4 + !$omp simd + do i = 1, SZ + u_send_s(i, j, k) = u(i, j, k) + u_send_e(i, j, k) = u(i, n - n_halo + j, k) end do - end do - !$omp end parallel do + !$omp end simd + end do + end do + !$omp end parallel do - ! halo exchange - call sendrecv_fields(u_recv_s, u_recv_e, u_send_s, u_send_e, & - SZ*n_halo*n_block, nproc, pprev, pnext) + ! halo exchange + call sendrecv_fields(u_recv_s, u_recv_e, u_send_s, u_send_e, & + SZ*n_halo*n_block, nproc, pprev, pnext) call exec_dist_tds_compact(du, u, u_recv_s, u_recv_e, send_s, send_e, & - recv_s, recv_e, tdsops, nproc, pprev, pnext, n_block) + recv_s, recv_e, tdsops, nproc, pprev, pnext, n_block) end do - end subroutine run_kernel + end subroutine run_kernel - subroutine set_u(u, line, n, n_block) + subroutine set_u(u, line, n, n_block) implicit none real(dp), intent(out), dimension(:, :, :) :: u @@ -313,16 +313,16 @@ subroutine set_u(u, line, n, n_block) integer :: i, j, k do k = 1, n_block - do j = 1, n - do i = 1, SZ - u(i, j, k) = line(j) - end do - end do + do j = 1, n + do i = 1, SZ + u(i, j, k) = line(j) + end do + end do end do - end subroutine set_u + end subroutine set_u - subroutine check_error_norm(du, line, n, n_glob, n_block, c, norm) + subroutine check_error_norm(du, line, n, n_glob, n_block, c, norm) implicit none real(dp), intent(inout), dimension(:, :, :) :: du @@ -333,20 +333,20 @@ subroutine check_error_norm(du, line, n, n_glob, n_block, c, norm) integer :: i, j, k do k = 1, n_block - do j = 1, n - do i = 1, SZ - du(i, j, k) = du(i, j, k) + c*line(j) - end do - end do + do j = 1, n + do i = 1, SZ + du(i, j, k) = du(i, j, k) + c*line(j) + end do + end do end do norm = norm2(du(:, 1:n, :)) norm = norm*norm/n_glob/n_block/SZ call MPI_Allreduce(MPI_IN_PLACE, norm, 1, MPI_DOUBLE_PRECISION, & - MPI_SUM, MPI_COMM_WORLD, ierr) + MPI_SUM, MPI_COMM_WORLD, ierr) norm = sqrt(norm) - end subroutine check_error_norm + end subroutine check_error_norm -end program test_omp_tridiag + end program test_omp_tridiag diff --git a/tests/test_allocator.f90 b/tests/test_allocator.f90 index fdeef85a..b8f9f573 100644 --- a/tests/test_allocator.f90 +++ b/tests/test_allocator.f90 @@ -1,77 +1,77 @@ -program test_allocator - use iso_fortran_env, only: stderr => error_unit + program test_allocator + use iso_fortran_env, only: stderr => error_unit - use m_allocator, only: allocator_t, field_t - use m_common, only: DIR_X + use m_allocator, only: allocator_t, field_t + use m_common, only: DIR_X - implicit none + implicit none - logical :: allpass - integer, parameter :: dims(3) = [8, 8, 8] - class(allocator_t), allocatable :: allocator - class(field_t), pointer :: ptr1, ptr2, ptr3 - integer, allocatable :: l(:) + logical :: allpass + integer, parameter :: dims(3) = [8, 8, 8] + class(allocator_t), allocatable :: allocator + class(field_t), pointer :: ptr1, ptr2, ptr3 + integer, allocatable :: l(:) - allocator = allocator_t(dims(1), dims(2), dims(3), 8) + allocator = allocator_t(dims(1), dims(2), dims(3), 8) - allpass = .true. + allpass = .true. - ! Get the list of ids for free blocks. By default there are none - ! and returned list is [0]. - l = allocator%get_block_ids() - if (size(l) /= 1 .or. l(1) /= 0) then - allpass = .false. - write(stderr, '(a)') 'Free list is initialised empty... failed' - else - write(stderr, '(a)') 'Free list is initialised empty... passed' - end if + ! Get the list of ids for free blocks. By default there are none + ! and returned list is [0]. + l = allocator%get_block_ids() + if (size(l) /= 1 .or. l(1) /= 0) then + allpass = .false. + write(stderr, '(a)') 'Free list is initialised empty... failed' + else + write(stderr, '(a)') 'Free list is initialised empty... passed' + end if - ! Request two blocks and release them in reverse order. List should - ! contain two free blocks. (1 -> 2) - ptr1 => allocator%get_block(DIR_X) - ptr2 => allocator%get_block(DIR_X) - call allocator%release_block(ptr2) - call allocator%release_block(ptr1) + ! Request two blocks and release them in reverse order. List should + ! contain two free blocks. (1 -> 2) + ptr1 => allocator%get_block(DIR_X) + ptr2 => allocator%get_block(DIR_X) + call allocator%release_block(ptr2) + call allocator%release_block(ptr1) - if (.not. all(allocator%get_block_ids() .eq. [1, 2])) then - allpass = .false. - write(stderr, '(a)') 'Blocks are released correctly... failed' - else - write(stderr, '(a)') 'Blocks are released correctly... passed' - end if + if (.not. all(allocator%get_block_ids() .eq. [1, 2])) then + allpass = .false. + write(stderr, '(a)') 'Blocks are released correctly... failed' + else + write(stderr, '(a)') 'Blocks are released correctly... passed' + end if - !! Destroy the free list and check that the list is empty again. - call allocator%destroy() - l = allocator%get_block_ids() - if (size(l) /= 1 .or. l(1) /= 0 .or. allocator%next_id /=0) then - allpass = .false. - write(stderr, '(a)') 'Free list is correctly destroyed... failed' - else - write(stderr, '(a)') 'Free list is correctly destroyed... passed' - end if + !! Destroy the free list and check that the list is empty again. + call allocator%destroy() + l = allocator%get_block_ids() + if (size(l) /= 1 .or. l(1) /= 0 .or. allocator%next_id /=0) then + allpass = .false. + write(stderr, '(a)') 'Free list is correctly destroyed... failed' + else + write(stderr, '(a)') 'Free list is correctly destroyed... passed' + end if - ! Request a block from a list of three. This should grab the first - ! block on top of the pile and reduce the free list to two blocks. - ptr1 => allocator%get_block(DIR_X) - ptr2 => allocator%get_block(DIR_X) - ptr3 => allocator%get_block(DIR_X) - call allocator%release_block(ptr3) - call allocator%release_block(ptr2) - call allocator%release_block(ptr1) - ptr1 => allocator%get_block(DIR_X) + ! Request a block from a list of three. This should grab the first + ! block on top of the pile and reduce the free list to two blocks. + ptr1 => allocator%get_block(DIR_X) + ptr2 => allocator%get_block(DIR_X) + ptr3 => allocator%get_block(DIR_X) + call allocator%release_block(ptr3) + call allocator%release_block(ptr2) + call allocator%release_block(ptr1) + ptr1 => allocator%get_block(DIR_X) - if (.not. all(allocator%get_block_ids() .eq. [2, 3])) then - allpass = .false. - write(stderr, '(a)') 'Block is correctly allocated... failed' - else - write(stderr, '(a)') 'Block is correctly allocated... passed' - end if + if (.not. all(allocator%get_block_ids() .eq. [2, 3])) then + allpass = .false. + write(stderr, '(a)') 'Block is correctly allocated... failed' + else + write(stderr, '(a)') 'Block is correctly allocated... passed' + end if - call allocator%destroy() + call allocator%destroy() - if (allpass) then - write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' - else - error stop 'SOME TESTS FAILED.' - end if -end program test_allocator + if (allpass) then + write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' + else + error stop 'SOME TESTS FAILED.' + end if + end program test_allocator diff --git a/tests/test_reordering.f90 b/tests/test_reordering.f90 index 52a4ddfc..7921cecb 100644 --- a/tests/test_reordering.f90 +++ b/tests/test_reordering.f90 @@ -1,4 +1,4 @@ -program test_reorder + program test_reorder use iso_fortran_env, only: stderr => error_unit use mpi @@ -7,20 +7,20 @@ program test_reorder use m_tdsops, only: dirps_t use m_common, only: dp, pi, globs_t, set_pprev_pnext, & - RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, & - DIR_X, DIR_Y, DIR_Z + RDR_X2Y, RDR_X2Z, RDR_Y2X, RDR_Y2Z, RDR_Z2X, RDR_Z2Y, & + DIR_X, DIR_Y, DIR_Z use m_ordering, only: get_index_dir, get_index_ijk #ifdef CUDA - use cudafor + use cudafor - use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t - use m_cuda_backend, only: cuda_backend_t - use m_cuda_common, only: SZ + use m_cuda_allocator, only: cuda_allocator_t, cuda_field_t + use m_cuda_backend, only: cuda_backend_t + use m_cuda_common, only: SZ #else - use m_omp_common, only: SZ - use m_omp_backend, only: omp_backend_t + use m_omp_common, only: SZ + use m_omp_backend, only: omp_backend_t #endif implicit none @@ -63,15 +63,15 @@ program test_reorder ierr = cudaSetDevice(mod(nrank, ndevs)) ! round-robin ierr = cudaGetDevice(devnum) #endif - + globs%nx = 32 globs%ny = 64 globs%nz = 96 - + globs%nx_loc = globs%nx/nproc globs%ny_loc = globs%ny/nproc globs%nz_loc = globs%nz/nproc - + globs%n_groups_x = globs%ny_loc*globs%nz_loc/SZ globs%n_groups_y = globs%nx_loc*globs%nz_loc/SZ globs%n_groups_z = globs%nx_loc*globs%ny_loc/SZ @@ -83,8 +83,8 @@ program test_reorder xdirps%n_blocks = globs%n_groups_x ydirps%n_blocks = globs%n_groups_y zdirps%n_blocks = globs%n_groups_z - - + + #ifdef CUDA cuda_allocator = cuda_allocator_t(globs%nx_loc, globs%ny_loc, globs%nz_loc, SZ) allocator => cuda_allocator @@ -112,22 +112,22 @@ program test_reorder pass_Y = .true. pass_Z = .true. - ! Test indexing only + ! Test indexing only do k=1, zdirps%n - do j=1, ydirps%n - do i=1, xdirps%n - call test_index_reversing(pass_X, i, j, k, DIR_X, SZ, xdirps%n, ydirps%n, zdirps%n) - call test_index_reversing(pass_Y, i, j, k, DIR_Y, SZ, xdirps%n, ydirps%n, zdirps%n) - call test_index_reversing(pass_Z, i, j, k, DIR_Z, SZ, xdirps%n, ydirps%n, zdirps%n) - end do + do j=1, ydirps%n + do i=1, xdirps%n + call test_index_reversing(pass_X, i, j, k, DIR_X, SZ, xdirps%n, ydirps%n, zdirps%n) + call test_index_reversing(pass_Y, i, j, k, DIR_Y, SZ, xdirps%n, ydirps%n, zdirps%n) + call test_index_reversing(pass_Z, i, j, k, DIR_Z, SZ, xdirps%n, ydirps%n, zdirps%n) end do + end do end do if (.not. pass_X) print *, "Error in X direction for index reversing" if (.not. pass_Y) print *, "Error in Y direction for index reversing" if (.not. pass_Z) print *, "Error in Z direction for index reversing" allpass = (pass_X .and. pass_Y .and. pass_Z) - + ! Test reordering u_x => allocator%get_block(DIR_X) @@ -145,13 +145,13 @@ program test_reorder allocate (temp_2(dims(1), dims(2), dims(3))) select type (u_x_original) - type is (cuda_field_t) - u_x_original%data_d = u_array + type is (cuda_field_t) + u_x_original%data_d = u_array end select #else select type (u_x_original) - type is (field_t) - u_x_original%data = u_array + type is (field_t) + u_x_original%data = u_array end select #endif @@ -172,54 +172,54 @@ program test_reorder call check_reorder(allpass, u_x, u_x_original, "testing Z2Y and Y2X failed") if (allpass) then - if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' + if (nrank == 0) write(stderr, '(a)') 'ALL TESTS PASSED SUCCESSFULLY.' else - error stop 'SOME TESTS FAILED.' + error stop 'SOME TESTS FAILED.' end if call MPI_Finalize(ierr) - contains + contains subroutine test_index_reversing(pass, i, j, k, dir, SZ, nx, ny, nz) - logical, intent(inout) :: pass - integer, intent(in) :: i, j, k ! original indices in the cartesian space - integer, intent(in) :: dir - integer, intent(in) :: SZ, nx, ny, nz - integer :: dir_i, dir_j, dir_k ! indices in the applicatin storage direction - integer :: cart_i, cart_j, cart_k ! newly computed indices in the cartesian space + logical, intent(inout) :: pass + integer, intent(in) :: i, j, k ! original indices in the cartesian space + integer, intent(in) :: dir + integer, intent(in) :: SZ, nx, ny, nz + integer :: dir_i, dir_j, dir_k ! indices in the applicatin storage direction + integer :: cart_i, cart_j, cart_k ! newly computed indices in the cartesian space - call get_index_dir(dir_i, dir_j, dir_k, i, j, k, dir, SZ, nx, ny, nz) - call get_index_ijk(cart_i, cart_j, cart_k, dir_i, dir_j, dir_k, dir, SZ, nx, ny, nz) + call get_index_dir(dir_i, dir_j, dir_k, i, j, k, dir, SZ, nx, ny, nz) + call get_index_ijk(cart_i, cart_j, cart_k, dir_i, dir_j, dir_k, dir, SZ, nx, ny, nz) - if (i /= cart_i .or. j /= cart_j .or. k /= cart_k) then - pass = .false. - end if + if (i /= cart_i .or. j /= cart_j .or. k /= cart_k) then + pass = .false. + end if end subroutine subroutine check_reorder(allpass, a, b, message) - logical, intent(inout) :: allpass - class(field_t), intent(in) :: a, b - character(len=*), intent(in) :: message - real(dp) :: tol = 1d-8 + logical, intent(inout) :: allpass + class(field_t), intent(in) :: a, b + character(len=*), intent(in) :: message + real(dp) :: tol = 1d-8 #ifdef CUDA - select type (a); type is (cuda_field_t); temp_1 = a%data_d; end select - select type (b); type is (cuda_field_t); temp_2 = b%data_d; end select - if (norm2(temp_1 - temp_2) > tol) then - allpass = .false. - write(stderr, '(a)') message - end if + select type (a); type is (cuda_field_t); temp_1 = a%data_d; end select + select type (b); type is (cuda_field_t); temp_2 = b%data_d; end select + if (norm2(temp_1 - temp_2) > tol) then + allpass = .false. + write(stderr, '(a)') message + end if #else - if (norm2(a%data - b%data) > tol) then - allpass = .false. - write(stderr, '(a)') message - end if + if (norm2(a%data - b%data) > tol) then + allpass = .false. + write(stderr, '(a)') message + end if #endif end subroutine -end program test_reorder + end program test_reorder