From 9ed9a3104092ba59b554edeb3cc3db2bdd8fed11 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Mon, 13 Jul 2020 20:07:23 +0200 Subject: [PATCH 01/44] sharedmem: decomposition in y --- libmpdata++/concurr/detail/sharedmem.hpp | 52 +++++++++++++----------- 1 file changed, 28 insertions(+), 24 deletions(-) diff --git a/libmpdata++/concurr/detail/sharedmem.hpp b/libmpdata++/concurr/detail/sharedmem.hpp index cf927d6a..6a1a2aad 100644 --- a/libmpdata++/concurr/detail/sharedmem.hpp +++ b/libmpdata++/concurr/detail/sharedmem.hpp @@ -48,6 +48,10 @@ namespace libmpdataxx std::array grid_size; bool panic = false; // for multi-threaded SIGTERM handling + // 1D and 2D - domain decomposed in 0-th dimension (x) + // 3D - domain decomposed in 1-st dimension (y) for better workload balance in MPI runs (MPI is decomposed in x) + const int decomp_dim; + detail::distmem distmem; // TODO: these are public because used from outside in alloc - could friendship help? @@ -84,14 +88,14 @@ namespace libmpdataxx // ctors // TODO: fill reducetmp with NaNs (or use 1-element arrvec_t - it's NaN-filled by default) sharedmem_common(const std::array &grid_size, const int &size) - : n(0), distmem(grid_size), size(size) // TODO: is n(0) needed? + : n(0), distmem(grid_size), size(size), decomp_dim(n_dims < 3 ? 0 : 1) // TODO: is n(0) needed? { for (int d = 0; d < n_dims; ++d) { this->grid_size[d] = slab( rng_t(0, grid_size[d]-1), - d == 0 ? distmem.rank() : 0, - d == 0 ? distmem.size() : 1 + d == decomp_dim ? distmem.rank() : 0, + d == decomp_dim ? distmem.size() : 1 ); origin[d] = this->grid_size[d].first(); } @@ -104,7 +108,7 @@ namespace libmpdataxx throw std::runtime_error("number of subdomains greater than number of gridpoints"); if (n_dims != 1) - sumtmp.reset(new blitz::Array(this->grid_size[0])); + sumtmp.reset(new blitz::Array(this->grid_size[n_dims-2])); xtmtmp.reset(new blitz::Array(size)); } @@ -113,11 +117,11 @@ namespace libmpdataxx { // doing a two-step sum to reduce numerical error // and make parallel results reproducible - for (int c = ijk[0].first(); c <= ijk[0].last(); ++c) // TODO: optimise for i.count() == 1 + for (int c = ijk[decomp_dim].first(); c <= ijk[decomp_dim].last(); ++c) // TODO: optimise for i.count() == 1 { auto slice_idx = ijk; - slice_idx.lbound(0) = c; - slice_idx.ubound(0) = c; + slice_idx.lbound(decomp_dim) = c; + slice_idx.ubound(decomp_dim) = c; if (sum_khn) (*sumtmp)(c) = blitz::kahan_sum(arr(slice_idx)); @@ -138,14 +142,14 @@ namespace libmpdataxx { // master thread calculates the sum from this process, stores in shared array if (sum_khn) - (*sumtmp)(grid_size[0].first())= blitz::kahan_sum(*sumtmp); // inplace?! + (*sumtmp)(grid_size[decomp_dim].first())= blitz::kahan_sum(*sumtmp); // inplace?! else - (*sumtmp)(grid_size[0].first())= blitz::sum(*sumtmp); // inplace?! + (*sumtmp)(grid_size[decomp_dim].first())= blitz::sum(*sumtmp); // inplace?! // master thread calculates sum of sums from all processes - (*sumtmp)(grid_size[0].first()) = this->distmem.sum((*sumtmp)(grid_size[0].first())); // inplace?! + (*sumtmp)(grid_size[decomp_dim].first()) = this->distmem.sum((*sumtmp)(grid_size[decomp_dim].first())); // inplace?! 
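// Worked example of the two-step reduction implemented above (hypothetical layout: 2 threads per
// process, 8 slabs of the decomposed dimension per process): each thread first writes one per-slice
// (Kahan) sum into (*sumtmp)(c) for every slab index c it owns, e.g. thread 0 fills c = 0..3 and
// thread 1 fills c = 4..7; after the barrier the master thread collapses sumtmp into a single
// per-process value, distmem.sum() combines those values across MPI ranks, and every thread of the
// process then reads the global total back from (*sumtmp)(grid_size[decomp_dim].first()).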
} barrier(); - double res = (*sumtmp)(grid_size[0].first()); // propagate the total sum to all threads of the process + double res = (*sumtmp)(grid_size[decomp_dim].first()); // propagate the total sum to all threads of the process barrier(); // to avoid sumtmp being overwritten by next call to sum from other thread return res; #endif @@ -156,11 +160,11 @@ namespace libmpdataxx { // doing a two-step sum to reduce numerical error // and make parallel results reproducible - for (int c = ijk[0].first(); c <= ijk[0].last(); ++c) + for (int c = ijk[decomp_dim].first(); c <= ijk[decomp_dim].last(); ++c) { auto slice_idx = ijk; - slice_idx.lbound(0) = c; - slice_idx.ubound(0) = c; + slice_idx.lbound(decomp_dim) = c; + slice_idx.ubound(decomp_dim) = c; if (sum_khn) (*sumtmp)(c) = blitz::kahan_sum(arr1(slice_idx) * arr2(slice_idx)); @@ -182,14 +186,14 @@ namespace libmpdataxx { // master thread calculates the sum from this process, stores in shared array if (sum_khn) - (*sumtmp)(grid_size[0].first())= blitz::kahan_sum(*sumtmp); // inplace?! + (*sumtmp)(grid_size[decomp_dim].first())= blitz::kahan_sum(*sumtmp); // inplace?! else - (*sumtmp)(grid_size[0].first())= blitz::sum(*sumtmp); // inplace?! + (*sumtmp)(grid_size[decomp_dim].first())= blitz::sum(*sumtmp); // inplace?! // master thread calculates sum of sums from all processes - (*sumtmp)(grid_size[0].first()) = this->distmem.sum((*sumtmp)(grid_size[0].first())); // inplace?! + (*sumtmp)(grid_size[decomp_dim].first()) = this->distmem.sum((*sumtmp)(grid_size[decomp_dim].first())); // inplace?! } barrier(); - double res = (*sumtmp)(grid_size[0].first()); // propagate the total sum to all threads of the process + double res = (*sumtmp)(grid_size[decomp_dim].first()); // propagate the total sum to all threads of the process barrier(); // to avoid sumtmp being overwritten by next call to sum from other thread return res; #endif @@ -207,12 +211,12 @@ namespace libmpdataxx #else if(rank == 0) { - (*xtmtmp)(0) = blitz::min(*xtmtmp); + (*xtmtmp)(decomp_dim) = blitz::min(*xtmtmp); // min across mpi processes - (*xtmtmp)(0) = this->distmem.min((*xtmtmp)(0)); + (*xtmtmp)(decomp_dim) = this->distmem.min((*xtmtmp)(decomp_dim)); } barrier(); - real_t res = (*xtmtmp)(0); // propagate the total min to all threads of the process + real_t res = (*xtmtmp)(decomp_dim); // propagate the total min to all threads of the process barrier(); // to avoid xtmtmp being overwritten by some other threads' next sum call return res; #endif @@ -231,12 +235,12 @@ namespace libmpdataxx #else if(rank == 0) { - (*xtmtmp)(0) = blitz::max(*xtmtmp); + (*xtmtmp)(decomp_dim) = blitz::max(*xtmtmp); // max across mpi processes - (*xtmtmp)(0) = this->distmem.max((*xtmtmp)(0)); + (*xtmtmp)(decomp_dim) = this->distmem.max((*xtmtmp)(decomp_dim)); } barrier(); - real_t res = (*xtmtmp)(0); // propagate the total max to all threads of the process + real_t res = (*xtmtmp)(decomp_dim); // propagate the total max to all threads of the process barrier(); // to avoid xtmtmp being overwritten by some other threads' next sum call return res; #endif From 6c21f1d49e2dcd136326f9c8c40c40d7650176e3 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Wed, 15 Jul 2020 11:41:56 +0200 Subject: [PATCH 02/44] sharedmem: decomposition in y cd --- libmpdata++/concurr/boost_thread.hpp | 2 +- libmpdata++/concurr/cxx11_thread.hpp | 2 +- libmpdata++/concurr/detail/concurr_common.hpp | 12 ++++++------ libmpdata++/concurr/openmp.hpp | 2 +- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git 
a/libmpdata++/concurr/boost_thread.hpp b/libmpdata++/concurr/boost_thread.hpp index 83675a7c..f1d7c7a6 100644 --- a/libmpdata++/concurr/boost_thread.hpp +++ b/libmpdata++/concurr/boost_thread.hpp @@ -81,7 +81,7 @@ namespace libmpdataxx // ctor boost_thread(const typename solver_t::rt_params_t &p) : - parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[0])) + parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[solver_t::n_dims < 3 ? 0 : 1])) // note 3D domain decomposition in y direction {} }; diff --git a/libmpdata++/concurr/cxx11_thread.hpp b/libmpdata++/concurr/cxx11_thread.hpp index b89a556e..77dc47d9 100644 --- a/libmpdata++/concurr/cxx11_thread.hpp +++ b/libmpdata++/concurr/cxx11_thread.hpp @@ -119,7 +119,7 @@ namespace libmpdataxx // ctor cxx11_thread(const typename solver_t::rt_params_t &p) : - parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[0])) + parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[solver_t::n_dims < 3 ? 0 : 1])) // note 3D domain decomposition in y direction {} }; diff --git a/libmpdata++/concurr/detail/concurr_common.hpp b/libmpdata++/concurr/detail/concurr_common.hpp index 691e4ff0..3cd79783 100644 --- a/libmpdata++/concurr/detail/concurr_common.hpp +++ b/libmpdata++/concurr/detail/concurr_common.hpp @@ -216,11 +216,11 @@ namespace libmpdataxx } } - // 3D version + // 3D version, note sharedmem in y direction! void init( const typename solver_t::rt_params_t &p, const std::array &grid_size, - const int &n0, const int &n1 = 1, const int &n2 = 1 + const int &n1, const int &n0 = 1, const int &n2 = 1 ) { typename solver_t::bcp_t bxl, bxr, byl, byr, bzl, bzr, shrdl, shrdr; @@ -246,11 +246,11 @@ namespace libmpdataxx algos.push_back( new solver_t( typename solver_t::ctor_args_t({ - i0, + i1, mem.get(), - i0 == 0 ? bxl : shrdl, - i0 == n0 - 1 ? bxr : shrdr, - byl, byr, + bxl, bxr, + i1 == 0 ? byl : shrdl, + i1 == n1 - 1 ? byr : shrdr, bzl, bzr, mem->slab(grid_size[0], i0, n0), mem->slab(grid_size[1], i1, n1), diff --git a/libmpdata++/concurr/openmp.hpp b/libmpdata++/concurr/openmp.hpp index 24f7c78b..d33cda98 100644 --- a/libmpdata++/concurr/openmp.hpp +++ b/libmpdata++/concurr/openmp.hpp @@ -77,7 +77,7 @@ namespace libmpdataxx // ctor openmp(const typename solver_t::rt_params_t &p) : - parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[0])) + parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[solver_t::n_dims < 3 ? 
0 : 1])) // note 3D domain decomposition in y direction {} }; From 23a48853d8a5ca61560767fdb42a1e976b1ecde8 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Wed, 15 Jul 2020 11:54:00 +0200 Subject: [PATCH 03/44] sharedmem decomp in y: extend_range --- libmpdata++/solvers/detail/solver_3d.hpp | 42 ++++++++++++------------ 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index b7a3825a..9f5066f7 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -43,11 +43,11 @@ namespace libmpdataxx const bool deriv = false ) final // for a given array { - const auto range_ijk_0__ext = this->extend_range(range_ijk[0], ext); + const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv); - for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk_0__ext, deriv); - for (auto &bc : this->bcs[2]) bc->fill_halos_sclr(arr, range_ijk_0__ext, range_ijk[1]^ext, deriv); + for (auto &bc : this->bcs[0]) bc->fill_halos_sclr(arr, range_ijk_1__ext, range_ijk[2]^ext, deriv); + for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk[0]^ext, deriv); + for (auto &bc : this->bcs[2]) bc->fill_halos_sclr(arr, range_ijk[0]^ext, range_ijk_1__ext, deriv); this->mem->barrier(); } void xchng(int e) final @@ -154,8 +154,8 @@ namespace libmpdataxx ) final { this->mem->barrier(); - const auto range_ijk_0__ext_h = this->extend_range(range_ijk[0], ext, h); - const auto range_ijk_0__ext_1 = this->extend_range(range_ijk[0], ext, 1); + const auto range_ijk_1__ext_h = this->extend_range(range_ijk[1], ext, h); + const auto range_ijk_1__ext_1 = this->extend_range(range_ijk[1], ext, 1); if (!cyclic) { for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk[2]^ext^1, range_ijk[0]^ext^h); @@ -169,24 +169,24 @@ namespace libmpdataxx this->mem->barrier(); } - for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk_0__ext_h, range_ijk[1]^ext^1); + for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk[0]^ext^h, range_ijk_1__ext_1); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); - for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_0__ext_1, range_ijk[1]^ext^h); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); - for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk_0__ext_1); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); } else { - for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[2]^ext^1, range_ijk_0__ext_h); - for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk_0__ext_h, range_ijk[1]^ext^1); + for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[2]^ext^1, range_ijk[0]^ext^h); + for (auto &bc : 
this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[0]^ext^h, range_ijk_1__ext_1); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); - for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_0__ext_1, range_ijk[1]^ext^h); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); - for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[2]^ext^h, range_ijk_0__ext_1); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); } this->mem->barrier(); } @@ -197,11 +197,11 @@ namespace libmpdataxx const int ext = 0 ) final { - const auto range_ijk_0__ext = this->extend_range(range_ijk[0], ext); + const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext); - for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk_0__ext); - for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk_0__ext, range_ijk[1]^ext); + for (auto &bc : this->bcs[0]) bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); + for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk[0]^ext); + for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk[0]^ext, range_ijk_1__ext); this->mem->barrier(); } From 203bc4ccaf70b867835d9d032fc39151e9e89791 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Wed, 15 Jul 2020 11:57:01 +0200 Subject: [PATCH 04/44] sharedmem decomp in y: ijkm_sep --- .../solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp b/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp index be1161df..3223f21f 100644 --- a/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp +++ b/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp @@ -208,8 +208,16 @@ namespace libmpdataxx ijkm_sep = ijkm; if (this->rank > 0) { - ijkm_sep.lbound()(0) = this->ijk[0].first(); - ijkm_sep.ubound()(0) = this->ijk[0].last(); + if(ct_params_t::n_dims < 3) + { + ijkm_sep.lbound()(0) = this->ijk[0].first(); + ijkm_sep.ubound()(0) = this->ijk[0].last(); + } + else + { + ijkm_sep.lbound()(1) = this->ijk[1].first(); + ijkm_sep.ubound()(1) = this->ijk[1].last(); + } } } From 58fbb1a9fff291db449fd3144fea9c8c55519179 Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Wed, 15 Jul 2020 19:44:05 +0200 Subject: [PATCH 05/44] sharedmem decomp in y: ijk_vec --- .../solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp b/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp index 3223f21f..4e92ad49 100644 --- a/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp +++ b/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp @@ -57,7 +57,7 @@ namespace libmpdataxx // ijkm 
with non-overlapping ranges idx_t ijkm_sep; - // like ijk, but with range in x direction extended by 1 to the left for rank=0 for MPI compliance. + // like ijk, but with range in x direction extended by 1 to the left for MPI compliance. // MPI requires that vector between two process domains is calculated by the process to the right of it (c.f. remote_2d.hpp fill_halos_vctr_alng) // TODO: change MPI logic to assume that it is calculated by the process to the left? then, ijk_vec would not be needed(?) std::array ijk_vec; @@ -202,7 +202,13 @@ namespace libmpdataxx ijk_vec[d] = rng_t(this->ijk[d].first(), this->ijk[d].last()); } - if (this->rank == 0) + + if(ct_params_t::n_dims < 3) // 1D and 2D - sharedmem in x direction + { + if (this->rank == 0) + ijk_vec[0] = rng_t(this->ijk[0].first() - 1, this->ijk[0].last()); + } + else // 3D - sharedmem in y direction ijk_vec[0] = rng_t(this->ijk[0].first() - 1, this->ijk[0].last()); ijkm_sep = ijkm; From 32bc27c426ebcdc2512eb0799aa2dc8db0afa0f5 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Thu, 1 Oct 2020 14:38:43 +0200 Subject: [PATCH 06/44] first try @ k,i,j storage order, 3D only for now --- libmpdata++/concurr/detail/sharedmem.hpp | 4 +- libmpdata++/solvers/detail/solver_3d.hpp | 49 +++++++++++++++++------- tests/sandbox/CMakeLists.txt | 18 ++++----- 3 files changed, 47 insertions(+), 24 deletions(-) diff --git a/libmpdata++/concurr/detail/sharedmem.hpp b/libmpdata++/concurr/detail/sharedmem.hpp index 6a1a2aad..cea99ca1 100644 --- a/libmpdata++/concurr/detail/sharedmem.hpp +++ b/libmpdata++/concurr/detail/sharedmem.hpp @@ -273,7 +273,9 @@ namespace libmpdataxx public: arr_t *never_delete(arr_t *arg) { - arr_t *ret = new arr_t(arg->dataFirst(), arg->shape(), blitz::neverDeleteData); + blitz::GeneralArrayStorage<3> storage; + storage.ordering() = blitz::thirdDim, blitz::firstDim, blitz::secondDim; + arr_t *ret = new arr_t(arg->dataFirst(), arg->shape(), blitz::neverDeleteData, storage); ret->reindexSelf(arg->base()); return ret; } diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index 9f5066f7..2b8f15d3 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -339,6 +339,7 @@ namespace libmpdataxx this->set_bcs(2, args.bczl, args.bczr); } + public: static void alloc( @@ -346,6 +347,8 @@ namespace libmpdataxx const int &n_iters ) { + static blitz::GeneralArrayStorage<3> storage; + storage.ordering() = blitz::thirdDim, blitz::firstDim, blitz::secondDim; // psi mem->psi.resize(parent_t::n_eqns); for (int e = 0; e < parent_t::n_eqns; ++e) // equations @@ -353,24 +356,28 @@ namespace libmpdataxx mem->psi[e].push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + storage ))); // Courant field components (Arakawa-C grid) mem->GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_vctr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + storage ))); mem->GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_vctr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + storage ))); mem->GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - 
parent_t::rng_vctr(mem->grid_size[2]) + parent_t::rng_vctr(mem->grid_size[2]), + storage ))); // fully third-order accurate mpdata needs also time derivatives of @@ -382,33 +389,39 @@ namespace libmpdataxx mem->ndt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_vctr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + storage ))); mem->ndt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_vctr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + storage ))); mem->ndt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_vctr(mem->grid_size[2]) + parent_t::rng_vctr(mem->grid_size[2]), + storage ))); mem->ndtt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_vctr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + storage ))); mem->ndtt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_vctr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + storage ))); mem->ndtt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_vctr(mem->grid_size[2]) + parent_t::rng_vctr(mem->grid_size[2]), + storage ))); } @@ -417,7 +430,8 @@ namespace libmpdataxx mem->G.reset(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + storage ))); // allocate Kahan summation temporary vars @@ -426,7 +440,8 @@ namespace libmpdataxx mem->khn_tmp.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + storage ))); // courant field alloc_tmp_sclr(mem, __FILE__, 1); @@ -441,6 +456,8 @@ namespace libmpdataxx bool srfc = false // allocate only surface data ) { + static blitz::GeneralArrayStorage<3> storage; + storage.ordering() = blitz::thirdDim, blitz::firstDim, blitz::secondDim; mem->tmp[__file__].push_back(new arrvec_t()); for (int n = 0; n < n_arr; ++n) { @@ -449,7 +466,8 @@ namespace libmpdataxx stgr[n][1] ? parent_t::rng_vctr(mem->grid_size[1]) : parent_t::rng_sclr(mem->grid_size[1]), srfc ? rng_t(0, 0) : stgr[n][2] ? parent_t::rng_vctr(mem->grid_size[2]) : - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + storage ))); } } @@ -471,6 +489,8 @@ namespace libmpdataxx bool srfc = false // allocate only surface data ) { + static blitz::GeneralArrayStorage<3> storage; + storage.ordering() = blitz::thirdDim, blitz::firstDim, blitz::secondDim; mem->tmp[__file__].push_back(new arrvec_t()); if (!name.empty()) mem->avail_tmp[name] = std::make_pair(__file__, mem->tmp[__file__].size() - 1); @@ -479,7 +499,8 @@ namespace libmpdataxx mem->tmp[__file__].back().push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - srfc ? rng_t(0, 0) : parent_t::rng_sclr(mem->grid_size[2]) + srfc ? 
rng_t(0, 0) : parent_t::rng_sclr(mem->grid_size[2]), + storage ))); } }; diff --git a/tests/sandbox/CMakeLists.txt b/tests/sandbox/CMakeLists.txt index f545441d..836461e2 100644 --- a/tests/sandbox/CMakeLists.txt +++ b/tests/sandbox/CMakeLists.txt @@ -50,13 +50,13 @@ endfunction() enable_testing() -add_subdirectory(mpi_adv) -add_subdirectory(straka) -add_subdirectory(tgv) -add_subdirectory(convergence_2d_3d) +#add_subdirectory(mpi_adv) +#add_subdirectory(straka) +#add_subdirectory(tgv) +#add_subdirectory(convergence_2d_3d) add_subdirectory(pbl) -add_subdirectory(convergence_spacetime) -add_subdirectory(bconds_div) -add_subdirectory(shear_layer) -add_subdirectory(convergence_vip_1d) -add_subdirectory(convergence_adv_diffusion) +#add_subdirectory(convergence_spacetime) +#add_subdirectory(bconds_div) +#add_subdirectory(shear_layer) +#add_subdirectory(convergence_vip_1d) +#add_subdirectory(convergence_adv_diffusion) From 27bd98fc01409d202d8356926b7b3e7f904786c9 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Thu, 1 Oct 2020 15:25:57 +0200 Subject: [PATCH 07/44] 3D array storage order defined in one place --- libmpdata++/blitz.hpp | 4 ++- libmpdata++/concurr/detail/sharedmem.hpp | 4 +-- libmpdata++/solvers/detail/solver_3d.hpp | 34 ++++++++++-------------- 3 files changed, 18 insertions(+), 24 deletions(-) diff --git a/libmpdata++/blitz.hpp b/libmpdata++/blitz.hpp index 606b0996..0dc0135b 100644 --- a/libmpdata++/blitz.hpp +++ b/libmpdata++/blitz.hpp @@ -112,5 +112,7 @@ namespace libmpdataxx #endif } }; -} // namespace libmpdataxx + // 3D blitz++ array storage with the k,i,j order for improved performance of sharedmem decomposition along j + blitz::GeneralArrayStorage<3> arr3D_storage({blitz::thirdDim, blitz::firstDim, blitz::secondDim}, {true, true, true}); +} // namespace libmpdataxx diff --git a/libmpdata++/concurr/detail/sharedmem.hpp b/libmpdata++/concurr/detail/sharedmem.hpp index cea99ca1..84a19aac 100644 --- a/libmpdata++/concurr/detail/sharedmem.hpp +++ b/libmpdata++/concurr/detail/sharedmem.hpp @@ -273,9 +273,7 @@ namespace libmpdataxx public: arr_t *never_delete(arr_t *arg) { - blitz::GeneralArrayStorage<3> storage; - storage.ordering() = blitz::thirdDim, blitz::firstDim, blitz::secondDim; - arr_t *ret = new arr_t(arg->dataFirst(), arg->shape(), blitz::neverDeleteData, storage); + arr_t *ret = new arr_t(arg->dataFirst(), arg->shape(), blitz::neverDeleteData, arr3D_storage); ret->reindexSelf(arg->base()); return ret; } diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index 2b8f15d3..96649fd9 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -347,8 +347,6 @@ namespace libmpdataxx const int &n_iters ) { - static blitz::GeneralArrayStorage<3> storage; - storage.ordering() = blitz::thirdDim, blitz::firstDim, blitz::secondDim; // psi mem->psi.resize(parent_t::n_eqns); for (int e = 0; e < parent_t::n_eqns; ++e) // equations @@ -357,7 +355,7 @@ namespace libmpdataxx parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), parent_t::rng_sclr(mem->grid_size[2]), - storage + arr3D_storage ))); // Courant field components (Arakawa-C grid) @@ -365,19 +363,19 @@ namespace libmpdataxx parent_t::rng_vctr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), parent_t::rng_sclr(mem->grid_size[2]), - storage + arr3D_storage ))); mem->GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_vctr(mem->grid_size[1]), 
parent_t::rng_sclr(mem->grid_size[2]), - storage + arr3D_storage ))); mem->GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), parent_t::rng_vctr(mem->grid_size[2]), - storage + arr3D_storage ))); // fully third-order accurate mpdata needs also time derivatives of @@ -390,38 +388,38 @@ namespace libmpdataxx parent_t::rng_vctr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), parent_t::rng_sclr(mem->grid_size[2]), - storage + arr3D_storage ))); mem->ndt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_vctr(mem->grid_size[1]), parent_t::rng_sclr(mem->grid_size[2]), - storage + arr3D_storage ))); mem->ndt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), parent_t::rng_vctr(mem->grid_size[2]), - storage + arr3D_storage ))); mem->ndtt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_vctr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), parent_t::rng_sclr(mem->grid_size[2]), - storage + arr3D_storage ))); mem->ndtt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_vctr(mem->grid_size[1]), parent_t::rng_sclr(mem->grid_size[2]), - storage + arr3D_storage ))); mem->ndtt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), parent_t::rng_vctr(mem->grid_size[2]), - storage + arr3D_storage ))); } @@ -431,7 +429,7 @@ namespace libmpdataxx parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), parent_t::rng_sclr(mem->grid_size[2]), - storage + arr3D_storage ))); // allocate Kahan summation temporary vars @@ -441,7 +439,7 @@ namespace libmpdataxx parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), parent_t::rng_sclr(mem->grid_size[2]), - storage + arr3D_storage ))); // courant field alloc_tmp_sclr(mem, __FILE__, 1); @@ -456,8 +454,6 @@ namespace libmpdataxx bool srfc = false // allocate only surface data ) { - static blitz::GeneralArrayStorage<3> storage; - storage.ordering() = blitz::thirdDim, blitz::firstDim, blitz::secondDim; mem->tmp[__file__].push_back(new arrvec_t()); for (int n = 0; n < n_arr; ++n) { @@ -467,7 +463,7 @@ namespace libmpdataxx srfc ? rng_t(0, 0) : stgr[n][2] ? parent_t::rng_vctr(mem->grid_size[2]) : parent_t::rng_sclr(mem->grid_size[2]), - storage + arr3D_storage ))); } } @@ -489,8 +485,6 @@ namespace libmpdataxx bool srfc = false // allocate only surface data ) { - static blitz::GeneralArrayStorage<3> storage; - storage.ordering() = blitz::thirdDim, blitz::firstDim, blitz::secondDim; mem->tmp[__file__].push_back(new arrvec_t()); if (!name.empty()) mem->avail_tmp[name] = std::make_pair(__file__, mem->tmp[__file__].size() - 1); @@ -500,7 +494,7 @@ namespace libmpdataxx parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), srfc ? 
rng_t(0, 0) : parent_t::rng_sclr(mem->grid_size[2]), - storage + arr3D_storage ))); } }; From 41014dd33f09beb72403299e6de2da55b059adaf Mon Sep 17 00:00:00 2001 From: pdziekan Date: Thu, 1 Oct 2020 15:51:59 +0200 Subject: [PATCH 08/44] arr3D storae only in 3D sharedmem --- libmpdata++/concurr/detail/sharedmem.hpp | 16 ++++++++++++---- tests/sandbox/CMakeLists.txt | 18 +++++++++--------- 2 files changed, 21 insertions(+), 13 deletions(-) diff --git a/libmpdata++/concurr/detail/sharedmem.hpp b/libmpdata++/concurr/detail/sharedmem.hpp index 84a19aac..db365085 100644 --- a/libmpdata++/concurr/detail/sharedmem.hpp +++ b/libmpdata++/concurr/detail/sharedmem.hpp @@ -29,7 +29,6 @@ namespace libmpdataxx > class sharedmem_common { - using arr_t = blitz::Array; static_assert(n_dims > 0, "n_dims <= 0"); static_assert(n_tlev > 0, "n_tlev <= 0"); @@ -39,6 +38,7 @@ namespace libmpdataxx protected: + using arr_t = blitz::Array; blitz::TinyVector origin; public: @@ -271,9 +271,9 @@ namespace libmpdataxx boost::ptr_vector tobefreed; public: - arr_t *never_delete(arr_t *arg) + virtual arr_t *never_delete(arr_t *arg) { - arr_t *ret = new arr_t(arg->dataFirst(), arg->shape(), blitz::neverDeleteData, arr3D_storage); + arr_t *ret = new arr_t(arg->dataFirst(), arg->shape(), blitz::neverDeleteData); ret->reindexSelf(arg->base()); return ret; } @@ -281,7 +281,7 @@ namespace libmpdataxx arr_t *old(arr_t *arg) { tobefreed.push_back(arg); - arr_t *ret = never_delete(arg); + arr_t *ret = this->never_delete(arg); return ret; } @@ -579,10 +579,18 @@ namespace libmpdataxx class sharedmem : public sharedmem_common { using parent_t = sharedmem_common; + using arr_t = typename parent_t::arr_t; using parent_t::parent_t; // inheriting ctors public: + virtual arr_t *never_delete(arr_t *arg) override + { + arr_t *ret = new arr_t(arg->dataFirst(), arg->shape(), blitz::neverDeleteData, arr3D_storage); + ret->reindexSelf(arg->base()); + return ret; + } + blitz::Array advectee(int e = 0) { assert(this->n < n_tlev); diff --git a/tests/sandbox/CMakeLists.txt b/tests/sandbox/CMakeLists.txt index 836461e2..f545441d 100644 --- a/tests/sandbox/CMakeLists.txt +++ b/tests/sandbox/CMakeLists.txt @@ -50,13 +50,13 @@ endfunction() enable_testing() -#add_subdirectory(mpi_adv) -#add_subdirectory(straka) -#add_subdirectory(tgv) -#add_subdirectory(convergence_2d_3d) +add_subdirectory(mpi_adv) +add_subdirectory(straka) +add_subdirectory(tgv) +add_subdirectory(convergence_2d_3d) add_subdirectory(pbl) -#add_subdirectory(convergence_spacetime) -#add_subdirectory(bconds_div) -#add_subdirectory(shear_layer) -#add_subdirectory(convergence_vip_1d) -#add_subdirectory(convergence_adv_diffusion) +add_subdirectory(convergence_spacetime) +add_subdirectory(bconds_div) +add_subdirectory(shear_layer) +add_subdirectory(convergence_vip_1d) +add_subdirectory(convergence_adv_diffusion) From f563bc15f4c7640e7bef63b592379d396968b963 Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Tue, 6 Oct 2020 15:54:27 +0200 Subject: [PATCH 09/44] shmem_y mpi with each thread doing mpi calls --- libmpdata++/bcond/detail/bcond_common.hpp | 2 +- libmpdata++/bcond/detail/remote_common.hpp | 31 ++++++----- libmpdata++/concurr/detail/concurr_common.hpp | 25 +++++++-- libmpdata++/concurr/detail/sharedmem.hpp | 55 +++++++++---------- 4 files changed, 65 insertions(+), 48 deletions(-) diff --git a/libmpdata++/bcond/detail/bcond_common.hpp b/libmpdata++/bcond/detail/bcond_common.hpp index 1e9c6df2..6d7ecf4d 100644 --- a/libmpdata++/bcond/detail/bcond_common.hpp +++ 
b/libmpdata++/bcond/detail/bcond_common.hpp @@ -15,7 +15,7 @@ namespace libmpdataxx using namespace arakawa_c; enum bcond_e { null, cyclic, polar, open, rigid, remote, gndsky, custom }; - enum drctn_e { left, rght }; + enum drctn_e { left=0, rght=1 }; template< typename real_t, diff --git a/libmpdata++/bcond/detail/remote_common.hpp b/libmpdata++/bcond/detail/remote_common.hpp index 2d2cfba7..75f8c925 100644 --- a/libmpdata++/bcond/detail/remote_common.hpp +++ b/libmpdata++/bcond/detail/remote_common.hpp @@ -31,7 +31,8 @@ namespace libmpdataxx private: - const int grid_size_0; +// const int grid_size_0; + const int thread_rank; #if defined(USE_MPI) boost::mpi::communicator mpicom; @@ -40,10 +41,12 @@ namespace libmpdataxx # if defined(NDEBUG) static const int n_reqs = 2; // data, reqs for recv only is enough? - static const int n_dbg_reqs = 0; + static const int n_dbg_send_reqs = 0; + static const int n_dbg_tags = 0; # else static const int n_reqs = 4; // data + ranges - static const int n_dbg_reqs = 1; + static const int n_dbg_send_reqs = 1; + static const int n_dbg_tags = 2; # endif std::array reqs; @@ -53,7 +56,6 @@ namespace libmpdataxx : (mpicom.rank() + 1 ) % mpicom.size(); # if !defined(NDEBUG) - const int debug = 2; std::pair buf_rng; # endif #endif @@ -76,7 +78,7 @@ namespace libmpdataxx // distinguishing between left and right messages // (important e.g. with 2 procs and cyclic bc) const int - msg_send = dir == left ? left : rght; + msg_send = (2 + n_dbg_tags) * thread_rank + (dir == left ? left : rght); // (n_tags_per_thread) * thread_rank to distinguish between messages sent by different threads // arr_send references part of the send buffer that will be used arr_t arr_send(buf_send, a(idx_send).shape(), blitz::neverDeleteData); @@ -92,7 +94,7 @@ namespace libmpdataxx // sending debug information # if !defined(NDEBUG) - reqs[1] = mpicom.isend(peer, msg_send ^ debug, std::pair( + reqs[1] = mpicom.isend(peer, msg_send + n_dbg_tags, std::pair( idx_send[0].first(), idx_send[0].last() )); @@ -110,17 +112,16 @@ namespace libmpdataxx { #if defined(USE_MPI) const int - msg_recv = dir == left ? rght : left; - + msg_recv = (2 + n_dbg_tags) * thread_rank + (dir == left ? rght : left); // launching async data transfer if(a(idx_recv).size()!=0) // TODO: test directly size of idx_recv { - reqs[1+n_dbg_reqs] = mpicom.irecv(peer, msg_recv, buf_recv, a(idx_recv).size()); + reqs[1+n_dbg_send_reqs] = mpicom.irecv(peer, msg_recv, buf_recv, a(idx_recv).size()); // sending debug information # if !defined(NDEBUG) - reqs[3] = mpicom.irecv(peer, msg_recv ^ debug, buf_rng); + reqs[3] = mpicom.irecv(peer, msg_recv + n_dbg_tags, buf_rng); # endif } #else @@ -137,7 +138,7 @@ namespace libmpdataxx send_hlpr(a, idx_send); // waiting for the transfers to finish - boost::mpi::wait_all(reqs.begin(), reqs.begin() + 1 + n_dbg_reqs); // MPI_Waitall is thread-safe? + boost::mpi::wait_all(reqs.begin(), reqs.begin() + 1 + n_dbg_send_reqs); // MPI_Waitall is thread-safe? #else assert(false); #endif @@ -153,7 +154,7 @@ namespace libmpdataxx recv_hlpr(a, idx_recv); // waiting for the transfers to finish - boost::mpi::wait_all(reqs.begin() + 1 + n_dbg_reqs, reqs.end()); // MPI_Waitall is thread-safe? + boost::mpi::wait_all(reqs.begin() + 1 + n_dbg_send_reqs, reqs.end()); // MPI_Waitall is thread-safe? 
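// Request-array layout assumed by the iterator arithmetic in the two wait_all calls above (as
// declared earlier in this file): with NDEBUG, n_reqs = 2 and n_dbg_send_reqs = 0, so
// reqs = { send-data, recv-data } and send()/recv() wait on reqs[0..0] / reqs[1..1] respectively;
// in debug builds, n_reqs = 4 and n_dbg_send_reqs = 1, so reqs = { send-data, send-ranges,
// recv-data, recv-ranges } and the waits split into reqs[0..1] (send side) and reqs[2..3] (recv side).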
// a blitz handler for the used part of the receive buffer arr_t arr_recv(buf_recv, a(idx_recv).shape(), blitz::neverDeleteData); // TODO: shape directly from idx_recv @@ -207,10 +208,12 @@ namespace libmpdataxx // ctor remote_common( const rng_t &i, - const std::array &grid_size + const std::array &grid_size, + const int thread_rank ) : parent_t(i, grid_size), - grid_size_0(grid_size[0]) +// grid_size_0(grid_size[0]), + thread_rank(thread_rank) { #if defined(USE_MPI) const int slice_size = n_dims==1 ? 1 : (n_dims==2? grid_size[1]+6 : (grid_size[1]+6) * (grid_size[2]+6) ); // 3 is the max halo size (?), so 6 on both sides diff --git a/libmpdata++/concurr/detail/concurr_common.hpp b/libmpdata++/concurr/detail/concurr_common.hpp index 3cd79783..ed8e3c6f 100644 --- a/libmpdata++/concurr/detail/concurr_common.hpp +++ b/libmpdata++/concurr/detail/concurr_common.hpp @@ -115,7 +115,8 @@ namespace libmpdataxx int dim > void bc_set( - typename solver_t::bcp_t &bcp + typename solver_t::bcp_t &bcp, + const int thread_rank = 0 // required only by remote (MPI) bcond ) { // sanity check - polar coords do not work with MPI yet @@ -123,7 +124,7 @@ namespace libmpdataxx throw std::runtime_error("Polar boundary conditions do not work with MPI."); // distmem overrides - if (type != bcond::remote && mem->distmem.size() > 1 && dim == 0) + if (mem->distmem.size() > 1 && dim == 0) { if ( // distmem domain interior @@ -133,7 +134,17 @@ namespace libmpdataxx // cyclic condition for distmem domain (note: will not work if a non-cyclic condition is on the other end) || (type == bcond::cyclic) - ) return bc_set(bcp); + ) + { + bcp.reset( + new bcond::bcond( + mem->slab(mem->grid_size[dim]), + mem->distmem.grid_size, + thread_rank + ) + ); + return; + } } // bc allocation, all mpi routines called by the remote bcnd ctor are thread-safe (?) 
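// How the thread_rank passed to the remote bcond above translates into MPI message tags (worked
// example with hypothetical values, following the msg_send formula this patch introduces in
// remote_common.hpp): with the debug tag pair enabled (n_dbg_tags = 2) the per-thread stride is 4,
// so thread 3 sending rightwards uses tag 4*3 + 1 = 13 and its debug-range message uses 13 + 2 = 15,
// while thread 0 keeps tags 0/1 for data and 2/3 for ranges; every (thread, direction, data-vs-debug)
// combination thus gets a distinct tag, which is what allows all threads to issue MPI calls
// concurrently in this revision (later patches in the series switch to thread 0 doing the exchanges).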
@@ -153,6 +164,7 @@ namespace libmpdataxx { typename solver_t::bcp_t bxl, bxr, shrdl, shrdr; + // NOTE: for remote bcond, thread_rank set to 0 on purpose in 1D to have propre left/right message tags bc_set(bxl); bc_set(bxr); @@ -189,6 +201,7 @@ namespace libmpdataxx { typename solver_t::bcp_t bxl, bxr, byl, byr, shrdl, shrdr; + // NOTE: for remote bcond, thread_rank set to 0 on purpose in 2D to have propre left/right message tags bc_set(bxl); bc_set(bxr); @@ -231,8 +244,10 @@ namespace libmpdataxx { for (int i2 = 0; i2 < n2; ++i2) { - bc_set(bxl); - bc_set(bxr); + // i1 is the local thread rank, needed by remote bcond to distinguish mpi messages sent by different threads + // multiple threads send left/right mpi messages, because we use sharedmem domain decomposition along x axis + bc_set(bxl, i1); + bc_set(bxr, i1); bc_set(byl); bc_set(byr); diff --git a/libmpdata++/concurr/detail/sharedmem.hpp b/libmpdata++/concurr/detail/sharedmem.hpp index db365085..0a1115cd 100644 --- a/libmpdata++/concurr/detail/sharedmem.hpp +++ b/libmpdata++/concurr/detail/sharedmem.hpp @@ -48,9 +48,10 @@ namespace libmpdataxx std::array grid_size; bool panic = false; // for multi-threaded SIGTERM handling + // dimension in which sharedmem domain decomposition is done // 1D and 2D - domain decomposed in 0-th dimension (x) // 3D - domain decomposed in 1-st dimension (y) for better workload balance in MPI runs (MPI is decomposed in x) - const int decomp_dim; + const int shmem_decomp_dim; detail::distmem distmem; @@ -88,22 +89,20 @@ namespace libmpdataxx // ctors // TODO: fill reducetmp with NaNs (or use 1-element arrvec_t - it's NaN-filled by default) sharedmem_common(const std::array &grid_size, const int &size) - : n(0), distmem(grid_size), size(size), decomp_dim(n_dims < 3 ? 0 : 1) // TODO: is n(0) needed? + : n(0), distmem(grid_size), size(size), shmem_decomp_dim(n_dims < 3 ? 0 : 1) // TODO: is n(0) needed? { for (int d = 0; d < n_dims; ++d) { this->grid_size[d] = slab( rng_t(0, grid_size[d]-1), - d == decomp_dim ? distmem.rank() : 0, - d == decomp_dim ? distmem.size() : 1 + d == 0 ? distmem.rank() : 0, // decomposition along x, because that's MPI decomposition + d == 0 ? distmem.size() : 1 + // d == shmem_decomp_dim ? distmem.rank() : 0, + // d == shmem_decomp_dim ? distmem.size() : 1 ); origin[d] = this->grid_size[d].first(); } - std::ostringstream oss; - oss << "grid_size[0]: " << this->grid_size[0] << " origin[0]: " << origin[0] << std::endl; - std::cerr << oss.str() << std::endl; - if (size > grid_size[0]) throw std::runtime_error("number of subdomains greater than number of gridpoints"); @@ -117,11 +116,11 @@ namespace libmpdataxx { // doing a two-step sum to reduce numerical error // and make parallel results reproducible - for (int c = ijk[decomp_dim].first(); c <= ijk[decomp_dim].last(); ++c) // TODO: optimise for i.count() == 1 + for (int c = ijk[shmem_decomp_dim].first(); c <= ijk[shmem_decomp_dim].last(); ++c) // TODO: optimise for i.count() == 1 { auto slice_idx = ijk; - slice_idx.lbound(decomp_dim) = c; - slice_idx.ubound(decomp_dim) = c; + slice_idx.lbound(shmem_decomp_dim) = c; + slice_idx.ubound(shmem_decomp_dim) = c; if (sum_khn) (*sumtmp)(c) = blitz::kahan_sum(arr(slice_idx)); @@ -142,14 +141,14 @@ namespace libmpdataxx { // master thread calculates the sum from this process, stores in shared array if (sum_khn) - (*sumtmp)(grid_size[decomp_dim].first())= blitz::kahan_sum(*sumtmp); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first())= blitz::kahan_sum(*sumtmp); // inplace?! 
else - (*sumtmp)(grid_size[decomp_dim].first())= blitz::sum(*sumtmp); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first())= blitz::sum(*sumtmp); // inplace?! // master thread calculates sum of sums from all processes - (*sumtmp)(grid_size[decomp_dim].first()) = this->distmem.sum((*sumtmp)(grid_size[decomp_dim].first())); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first()) = this->distmem.sum((*sumtmp)(grid_size[shmem_decomp_dim].first())); // inplace?! } barrier(); - double res = (*sumtmp)(grid_size[decomp_dim].first()); // propagate the total sum to all threads of the process + double res = (*sumtmp)(grid_size[shmem_decomp_dim].first()); // propagate the total sum to all threads of the process barrier(); // to avoid sumtmp being overwritten by next call to sum from other thread return res; #endif @@ -160,11 +159,11 @@ namespace libmpdataxx { // doing a two-step sum to reduce numerical error // and make parallel results reproducible - for (int c = ijk[decomp_dim].first(); c <= ijk[decomp_dim].last(); ++c) + for (int c = ijk[shmem_decomp_dim].first(); c <= ijk[shmem_decomp_dim].last(); ++c) { auto slice_idx = ijk; - slice_idx.lbound(decomp_dim) = c; - slice_idx.ubound(decomp_dim) = c; + slice_idx.lbound(shmem_decomp_dim) = c; + slice_idx.ubound(shmem_decomp_dim) = c; if (sum_khn) (*sumtmp)(c) = blitz::kahan_sum(arr1(slice_idx) * arr2(slice_idx)); @@ -186,14 +185,14 @@ namespace libmpdataxx { // master thread calculates the sum from this process, stores in shared array if (sum_khn) - (*sumtmp)(grid_size[decomp_dim].first())= blitz::kahan_sum(*sumtmp); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first())= blitz::kahan_sum(*sumtmp); // inplace?! else - (*sumtmp)(grid_size[decomp_dim].first())= blitz::sum(*sumtmp); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first())= blitz::sum(*sumtmp); // inplace?! // master thread calculates sum of sums from all processes - (*sumtmp)(grid_size[decomp_dim].first()) = this->distmem.sum((*sumtmp)(grid_size[decomp_dim].first())); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first()) = this->distmem.sum((*sumtmp)(grid_size[shmem_decomp_dim].first())); // inplace?! 
} barrier(); - double res = (*sumtmp)(grid_size[decomp_dim].first()); // propagate the total sum to all threads of the process + double res = (*sumtmp)(grid_size[shmem_decomp_dim].first()); // propagate the total sum to all threads of the process barrier(); // to avoid sumtmp being overwritten by next call to sum from other thread return res; #endif @@ -211,12 +210,12 @@ namespace libmpdataxx #else if(rank == 0) { - (*xtmtmp)(decomp_dim) = blitz::min(*xtmtmp); + (*xtmtmp)(shmem_decomp_dim) = blitz::min(*xtmtmp); // min across mpi processes - (*xtmtmp)(decomp_dim) = this->distmem.min((*xtmtmp)(decomp_dim)); + (*xtmtmp)(shmem_decomp_dim) = this->distmem.min((*xtmtmp)(shmem_decomp_dim)); } barrier(); - real_t res = (*xtmtmp)(decomp_dim); // propagate the total min to all threads of the process + real_t res = (*xtmtmp)(shmem_decomp_dim); // propagate the total min to all threads of the process barrier(); // to avoid xtmtmp being overwritten by some other threads' next sum call return res; #endif @@ -235,12 +234,12 @@ namespace libmpdataxx #else if(rank == 0) { - (*xtmtmp)(decomp_dim) = blitz::max(*xtmtmp); + (*xtmtmp)(shmem_decomp_dim) = blitz::max(*xtmtmp); // max across mpi processes - (*xtmtmp)(decomp_dim) = this->distmem.max((*xtmtmp)(decomp_dim)); + (*xtmtmp)(shmem_decomp_dim) = this->distmem.max((*xtmtmp)(shmem_decomp_dim)); } barrier(); - real_t res = (*xtmtmp)(decomp_dim); // propagate the total max to all threads of the process + real_t res = (*xtmtmp)(shmem_decomp_dim); // propagate the total max to all threads of the process barrier(); // to avoid xtmtmp being overwritten by some other threads' next sum call return res; #endif From ddf70b5e35f07a7380abf8f68a08fd5063f6e88f Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Wed, 7 Oct 2020 20:31:55 +0200 Subject: [PATCH 10/44] remote 3d shmem y single thread wip --- libmpdata++/bcond/detail/polar_common.hpp | 6 +- libmpdata++/bcond/detail/remote_common.hpp | 16 ++--- libmpdata++/bcond/remote_3d.hpp | 10 +-- libmpdata++/concurr/detail/concurr_common.hpp | 62 ++++++++++++++++--- 4 files changed, 68 insertions(+), 26 deletions(-) diff --git a/libmpdata++/bcond/detail/polar_common.hpp b/libmpdata++/bcond/detail/polar_common.hpp index ebdb8326..193cf72a 100644 --- a/libmpdata++/bcond/detail/polar_common.hpp +++ b/libmpdata++/bcond/detail/polar_common.hpp @@ -37,10 +37,10 @@ namespace libmpdataxx // ctor polar_common( const rng_t &i, - const std::array &grid_size + const std::array &distmem_grid_size ) : - parent_t(i, grid_size), - pole((grid_size[0] - 1) / 2) + parent_t(i, distmem_grid_size), + pole((distmem_grid_size[0] - 1) / 2) {} }; } // namespace detail diff --git a/libmpdata++/bcond/detail/remote_common.hpp b/libmpdata++/bcond/detail/remote_common.hpp index 75f8c925..4b2da431 100644 --- a/libmpdata++/bcond/detail/remote_common.hpp +++ b/libmpdata++/bcond/detail/remote_common.hpp @@ -31,9 +31,6 @@ namespace libmpdataxx private: -// const int grid_size_0; - const int thread_rank; - #if defined(USE_MPI) boost::mpi::communicator mpicom; real_t *buf_send, @@ -78,7 +75,7 @@ namespace libmpdataxx // distinguishing between left and right messages // (important e.g. with 2 procs and cyclic bc) const int - msg_send = (2 + n_dbg_tags) * thread_rank + (dir == left ? left : rght); // (n_tags_per_thread) * thread_rank to distinguish between messages sent by different threads + msg_send = dir == left ? 
left : rght; // arr_send references part of the send buffer that will be used arr_t arr_send(buf_send, a(idx_send).shape(), blitz::neverDeleteData); @@ -112,7 +109,7 @@ namespace libmpdataxx { #if defined(USE_MPI) const int - msg_recv = (2 + n_dbg_tags) * thread_rank + (dir == left ? rght : left); + msg_recv = dir == left ? rght : left; // launching async data transfer if(a(idx_recv).size()!=0) // TODO: test directly size of idx_recv @@ -208,15 +205,12 @@ namespace libmpdataxx // ctor remote_common( const rng_t &i, - const std::array &grid_size, - const int thread_rank + const std::array &distmem_grid_size ) : - parent_t(i, grid_size), -// grid_size_0(grid_size[0]), - thread_rank(thread_rank) + parent_t(i, distmem_grid_size) { #if defined(USE_MPI) - const int slice_size = n_dims==1 ? 1 : (n_dims==2? grid_size[1]+6 : (grid_size[1]+6) * (grid_size[2]+6) ); // 3 is the max halo size (?), so 6 on both sides + const int slice_size = n_dims==1 ? 1 : (n_dims==2? distmem_grid_size[1]+6 : (distmem_grid_size[1]+6) * (distmem_grid_size[2]+6) ); // 3 is the max halo size (?), so 6 on both sides // allocate enough memory in buffers to store largest halos to be sent buf_send = (real_t *) malloc(halo * slice_size * sizeof(real_t)); buf_recv = (real_t *) malloc(halo * slice_size * sizeof(real_t)); diff --git a/libmpdata++/bcond/remote_3d.hpp b/libmpdata++/bcond/remote_3d.hpp index 2ec442b1..ed401ad4 100644 --- a/libmpdata++/bcond/remote_3d.hpp +++ b/libmpdata++/bcond/remote_3d.hpp @@ -5,7 +5,7 @@ #pragma once -#include +#include namespace libmpdataxx { @@ -18,10 +18,10 @@ namespace libmpdataxx dir == left && n_dims == 3 >::type - > : public detail::remote_common + > : public detail::remote_3d_common { - using parent_t = detail::remote_common; + using parent_t = detail::remote_3d_common; using arr_t = blitz::Array; using parent_t::parent_t; // inheriting ctor @@ -131,9 +131,9 @@ namespace libmpdataxx dir == rght && n_dims == 3 >::type - > : public detail::remote_common + > : public detail::remote_3d_common { - using parent_t = detail::remote_common; + using parent_t = detail::remote_3d_common; using arr_t = blitz::Array; using parent_t::parent_t; // inheriting ctor diff --git a/libmpdata++/concurr/detail/concurr_common.hpp b/libmpdata++/concurr/detail/concurr_common.hpp index ed8e3c6f..4fae0ef9 100644 --- a/libmpdata++/concurr/detail/concurr_common.hpp +++ b/libmpdata++/concurr/detail/concurr_common.hpp @@ -40,6 +40,53 @@ namespace libmpdataxx { namespace detail { + template < + class real_t, + class mem_t, + bcond::drctn_e dir, + int dim, + int n_dims, + int halo + > + void bc_set_remote( + typename solver_t::bcp_t &bcp, + const std::unique_ptr &mem; + const int thread_rank, + const int thread_count + ) + { + bcp.reset( + new bcond::bcond( + mem->slab(mem->grid_size[dim]), + mem->distmem.grid_size + ) + ); + } + + template < + class real_t, + class mem_t, + bcond::drctn_e dir, + int dim, + int halo + > + void bc_set_remote( + typename solver_t::bcp_t &bcp, + const std::unique_ptr &mem; + const int thread_rank, + const int thread_count + ) + { + bcp.reset( + new bcond::bcond( + mem->slab(mem->grid_size[dim]), + mem->distmem.grid_size, + mem->slab(mem->grid_size[1], thread_rank, thread_count), + thread_rank + ) + ); + } + template< class solver_t_, bcond::bcond_e bcxl, bcond::bcond_e bcxr, @@ -116,7 +163,8 @@ namespace libmpdataxx > void bc_set( typename solver_t::bcp_t &bcp, - const int thread_rank = 0 // required only by remote (MPI) bcond + const int thread_rank = 0, // required only by 3D remote (MPI) 
bcond + const int thread_count = 0 // required only by 3D remote (MPI) bcond ) { // sanity check - polar coords do not work with MPI yet @@ -136,10 +184,12 @@ namespace libmpdataxx (type == bcond::cyclic) ) { - bcp.reset( + // bc allocation, all mpi routines called by the remote bcnd ctor are thread-safe (?) + bc_set_remote( new bcond::bcond( mem->slab(mem->grid_size[dim]), mem->distmem.grid_size, + mem->slab(mem->grid_size[1], thread_rank, thread_count), thread_rank ) ); @@ -147,7 +197,6 @@ namespace libmpdataxx } } - // bc allocation, all mpi routines called by the remote bcnd ctor are thread-safe (?) bcp.reset( new bcond::bcond( mem->slab(mem->grid_size[dim]), @@ -244,10 +293,9 @@ namespace libmpdataxx { for (int i2 = 0; i2 < n2; ++i2) { - // i1 is the local thread rank, needed by remote bcond to distinguish mpi messages sent by different threads - // multiple threads send left/right mpi messages, because we use sharedmem domain decomposition along x axis - bc_set(bxl, i1); - bc_set(bxr, i1); + // i1 is the local thread rank, n1 is the number of threads. These are needed by remote bcond, because only rank=0 does mpi communication + bc_set(bxl, i1, n1); + bc_set(bxr, i1, n1); bc_set(byl); bc_set(byr); From 706d27fa351699decae0b5bd97023f9d5f4c3c97 Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Thu, 8 Oct 2020 11:32:35 +0200 Subject: [PATCH 11/44] wip on bc set remote --- libmpdata++/concurr/detail/concurr_common.hpp | 85 ++++++++++++------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/libmpdata++/concurr/detail/concurr_common.hpp b/libmpdata++/concurr/detail/concurr_common.hpp index 4fae0ef9..27cd6ad5 100644 --- a/libmpdata++/concurr/detail/concurr_common.hpp +++ b/libmpdata++/concurr/detail/concurr_common.hpp @@ -42,49 +42,76 @@ namespace libmpdataxx { template < class real_t, - class mem_t, bcond::drctn_e dir, int dim, int n_dims, - int halo + int halo, + class bcp_t, + class mem_t > - void bc_set_remote( - typename solver_t::bcp_t &bcp, - const std::unique_ptr &mem; - const int thread_rank, - const int thread_count - ) + struct bc_set_remote_impl { - bcp.reset( - new bcond::bcond( - mem->slab(mem->grid_size[dim]), - mem->distmem.grid_size - ) - ); - } + static void _( + bcp_t &bcp, + const std::unique_ptr &mem, + const int thread_rank, + const int thread_count + ) + { + bcp.reset( + new bcond::bcond( + mem->slab(mem->grid_size[dim]), + mem->distmem.grid_size + ) + ); + } + }; template < class real_t, - class mem_t, bcond::drctn_e dir, int dim, - int halo + int halo, + class bcp_t, + class mem_t > - void bc_set_remote( - typename solver_t::bcp_t &bcp, - const std::unique_ptr &mem; + struct bc_set_remote_impl + { + static void _( + bcp_t &bcp, + const std::unique_ptr &mem, + const int thread_rank, + const int thread_count + ) + { + bcp.reset( + new bcond::bcond( + mem->slab(mem->grid_size[dim]), + mem->distmem.grid_size, + mem->slab(mem->grid_size[1], thread_rank, thread_count), + thread_rank + ) + ); + } + }; + + template < + class real_t, + bcond::drctn_e dir, + int dim, + int n_dims, + int halo, + class bcp_t, + class mem_t + > + void bc_set_remote( + bcp_t &bcp, + const std::unique_ptr &mem, const int thread_rank, const int thread_count ) { - bcp.reset( - new bcond::bcond( - mem->slab(mem->grid_size[dim]), - mem->distmem.grid_size, - mem->slab(mem->grid_size[1], thread_rank, thread_count), - thread_rank - ) - ); + bc_set_remote_impl::_(bcp, mem, thread_rank, thread_count); } template< @@ -185,7 +212,7 @@ namespace libmpdataxx ) { // bc allocation, all mpi 
routines called by the remote bcnd ctor are thread-safe (?) - bc_set_remote( + bc_set_remote( new bcond::bcond( mem->slab(mem->grid_size[dim]), mem->distmem.grid_size, From ef2d2aa853f45e778708db29945b2b15e85a9624 Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Thu, 8 Oct 2020 11:36:54 +0200 Subject: [PATCH 12/44] add remote_3d_common --- libmpdata++/bcond/detail/remote_3d_common.hpp | 92 +++++++++++++++++++ 1 file changed, 92 insertions(+) create mode 100644 libmpdata++/bcond/detail/remote_3d_common.hpp diff --git a/libmpdata++/bcond/detail/remote_3d_common.hpp b/libmpdata++/bcond/detail/remote_3d_common.hpp new file mode 100644 index 00000000..8515d767 --- /dev/null +++ b/libmpdata++/bcond/detail/remote_3d_common.hpp @@ -0,0 +1,92 @@ +// common code for ``remote'' MPI boundary conditions for libmpdata++ +// +// licensing: GPU GPL v3 +// copyright: University of Warsaw + +#pragma once + +#include + +namespace libmpdataxx +{ + namespace bcond + { + namespace detail + { + template + class remote_3d_common : public remote_common + { + using parent_t = detail::remote_common; + + using arr_t = blitz::Array; + using idx_t = blitz::RectDomain<3>; + + const int thread_rank; + const rng_t thread_j; + const int grid_size_y; + + public: + + void xchng( + const arr_t &a, + const idx_t &idx_send, + const idx_t &idx_recv + ) + { + if(thread_rank != 0) return; + + // domain to be sent but extended in y to the full sharedmem domain + idx_t idx_send_shmem(idx_send); + idx_t idx_recv_shmem(idx_recv); + + // try to guess what should be the whole domain exchanged by this process + // based on the difference between idx_send and thread_j + const auto j_rng_frst_diff = idx_send.lbound(1) - thread_j.lbound(0); + const auto j_rng_scnd_diff = idx_send.ubound(1) - thread_j.ubound(0); + idx_send_shmem.lbound(1) = 0 + j_rng_frst_diff; // does it have to start at 0? + idx_send_shmem.ubound(1) = grid_size_y + j_rng_scnd_diff; // does it have to end at grid_size_y? what about courants? + idx_recv_shmem.lbound(1) = 0 + j_rng_frst_diff; // does it have to start at 0? + idx_recv_shmem.ubound(1) = grid_size_y + j_rng_scnd_diff; // does it have to end at grid_size_y? what about courants? 
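// Worked example of the extension above (hypothetical sizes): with grid_size_y = 100 and 4 threads,
// thread 1 owns thread_j = 25..49; exchanging exactly its own j = 25..49 gives zero offsets, so the
// extended ranges become j = 0..100, i.e. the whole shared-memory extent of this process, and
// halo-extended requests shift these bounds by the same offsets. Since only thread 0 gets past the
// early return above, one MPI exchange is meant to cover what all threads of the process need
// (in this WIP revision the call below still passes the unextended idx_send / idx_recv; a later
// patch in the series reworks the extension into extend_idx).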
+ + parent_t::xchng(a, idx_send, idx_recv); + } + + // ctor + remote_3d_common( + const rng_t &i, + const std::array &distmem_grid_size, + const rng_t thread_j, + const int thread_rank + ) : + parent_t(i, distmem_grid_size), + thread_rank(thread_rank), + thread_j(thread_j) + { +#if defined(USE_MPI) + // only thread 0 does mpi, others don't need buffers + if(thread_rank != 0) + { + free(parent_t::buf_send); + free(parent_t::buf_recv); + } +#endif + } + + // dtor + ~remote_3d_common() + { +#if defined(USE_MPI) + if(thread_rank == 0) + { + free(parent_t::buf_send); + free(parent_t::buf_recv); + } + // hack to make free in ~remote_common give defined behaviour + parent_t::buf_send = nullptr; + parent_t::buf_recv = nullptr; +#endif + } + }; + }; + } // namespace bcond +} // namespace libmpdataxx From 425ccfe5c943f1975f96e6b92a35b10cbb993548 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Thu, 8 Oct 2020 11:45:46 +0200 Subject: [PATCH 13/44] fix compilation of remote 3d shmem y --- libmpdata++/bcond/detail/remote_3d_common.hpp | 3 ++- libmpdata++/bcond/detail/remote_common.hpp | 5 ++-- libmpdata++/concurr/detail/concurr_common.hpp | 24 +++++++++---------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/libmpdata++/bcond/detail/remote_3d_common.hpp b/libmpdata++/bcond/detail/remote_3d_common.hpp index 8515d767..b28560eb 100644 --- a/libmpdata++/bcond/detail/remote_3d_common.hpp +++ b/libmpdata++/bcond/detail/remote_3d_common.hpp @@ -60,7 +60,8 @@ namespace libmpdataxx ) : parent_t(i, distmem_grid_size), thread_rank(thread_rank), - thread_j(thread_j) + thread_j(thread_j), + grid_size_y(distmem_grid_size[1]) { #if defined(USE_MPI) // only thread 0 does mpi, others don't need buffers diff --git a/libmpdata++/bcond/detail/remote_common.hpp b/libmpdata++/bcond/detail/remote_common.hpp index 4b2da431..d0a0c57e 100644 --- a/libmpdata++/bcond/detail/remote_common.hpp +++ b/libmpdata++/bcond/detail/remote_common.hpp @@ -29,12 +29,13 @@ namespace libmpdataxx using arr_t = blitz::Array; using idx_t = blitz::RectDomain; + real_t *buf_send, + *buf_recv; + private: #if defined(USE_MPI) boost::mpi::communicator mpicom; - real_t *buf_send, - *buf_recv; # if defined(NDEBUG) static const int n_reqs = 2; // data, reqs for recv only is enough? 
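For reference, the bc_set_remote_impl / bc_set_remote pair used in the concurr_common.hpp hunks is the usual work-around for the fact that C++ function templates cannot be partially specialized: the work goes into a class template with a static method, partially specialized for n_dims == 3 (where the per-thread j-slab and thread rank are also handed to the bcond ctor), and a thin forwarding function keeps the call sites readable. A minimal, self-contained sketch of the idiom, with placeholder parameters and prints standing in for the real bcp/mem machinery (not the actual libmpdata++ signatures):

#include <iostream>

// primary template: generic (1D / 2D) remote bcond, no per-thread slab needed
template <typename real_t, int n_dims>
struct bc_set_remote_impl
{
  static void _(const int thread_rank, const int thread_size)
  {
    std::cout << n_dims << "d remote bcond, thread " << thread_rank << "\n";
  }
};

// partial specialization for 3D: the bcond additionally receives the thread's j-slab bounds
template <typename real_t>
struct bc_set_remote_impl<real_t, 3>
{
  static void _(const int thread_rank, const int thread_size)
  {
    std::cout << "3d remote bcond, j-slab " << thread_rank << " of " << thread_size << "\n";
  }
};

// forwarding helper - function templates cannot be partially specialized, hence the struct
template <typename real_t, int n_dims>
void bc_set_remote(const int thread_rank, const int thread_size)
{
  bc_set_remote_impl<real_t, n_dims>::_(thread_rank, thread_size);
}

int main()
{
  bc_set_remote<double, 2>(0, 4); // dispatches to the primary template
  bc_set_remote<double, 3>(3, 4); // dispatches to the 3D specialization
}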
diff --git a/libmpdata++/concurr/detail/concurr_common.hpp b/libmpdata++/concurr/detail/concurr_common.hpp index 27cd6ad5..717b9bac 100644 --- a/libmpdata++/concurr/detail/concurr_common.hpp +++ b/libmpdata++/concurr/detail/concurr_common.hpp @@ -55,7 +55,7 @@ namespace libmpdataxx bcp_t &bcp, const std::unique_ptr &mem, const int thread_rank, - const int thread_count + const int thread_size ) { bcp.reset( @@ -81,14 +81,14 @@ namespace libmpdataxx bcp_t &bcp, const std::unique_ptr &mem, const int thread_rank, - const int thread_count + const int thread_size ) { bcp.reset( - new bcond::bcond( + new bcond::bcond( mem->slab(mem->grid_size[dim]), mem->distmem.grid_size, - mem->slab(mem->grid_size[1], thread_rank, thread_count), + mem->slab(mem->grid_size[1], thread_rank, thread_size), thread_rank ) ); @@ -108,10 +108,10 @@ namespace libmpdataxx bcp_t &bcp, const std::unique_ptr &mem, const int thread_rank, - const int thread_count + const int thread_size ) { - bc_set_remote_impl::_(bcp, mem, thread_rank, thread_count); + bc_set_remote_impl::_(bcp, mem, thread_rank, thread_size); } template< @@ -191,7 +191,7 @@ namespace libmpdataxx void bc_set( typename solver_t::bcp_t &bcp, const int thread_rank = 0, // required only by 3D remote (MPI) bcond - const int thread_count = 0 // required only by 3D remote (MPI) bcond + const int thread_size = 0 // required only by 3D remote (MPI) bcond ) { // sanity check - polar coords do not work with MPI yet @@ -213,12 +213,10 @@ namespace libmpdataxx { // bc allocation, all mpi routines called by the remote bcnd ctor are thread-safe (?) bc_set_remote( - new bcond::bcond( - mem->slab(mem->grid_size[dim]), - mem->distmem.grid_size, - mem->slab(mem->grid_size[1], thread_rank, thread_count), - thread_rank - ) + bcp, + mem, + thread_rank, + thread_size ); return; } From c7eea21d0d6e80ab678ba154d56b15a85f58ba19 Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Thu, 8 Oct 2020 12:38:49 +0200 Subject: [PATCH 14/44] remote 3d fixes --- libmpdata++/bcond/detail/remote_3d_common.hpp | 43 ++++++++++++------- libmpdata++/bcond/detail/remote_common.hpp | 6 +-- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/libmpdata++/bcond/detail/remote_3d_common.hpp b/libmpdata++/bcond/detail/remote_3d_common.hpp index b28560eb..cedbabde 100644 --- a/libmpdata++/bcond/detail/remote_3d_common.hpp +++ b/libmpdata++/bcond/detail/remote_3d_common.hpp @@ -25,30 +25,43 @@ namespace libmpdataxx const rng_t thread_j; const int grid_size_y; + // try to guess what should be the whole domain exchanged by this process + // based on the difference between idx to be sent by this thread and idx of this process + idx_t extend_idx(idx_t idx) + { + idx.lbound(1) = 0 + idx.lbound(1) - thread_j.lbound(0); // does it have to start at 0? + idx.ubound(1) = grid_size_y + idx.ubound(1) - thread_j.ubound(0); // does it have to end at grid_size_y? what about courants? 
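// rationale: each thread fills halos only over its own j-slab, but the MPI exchange in x is
// carried out by a single thread (rank 0 at this point in the series; a later commit splits the
// left and right edges between two threads), so that thread widens its slab back to the full
// y-extent of the process and a single message covers what all the threads computed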
+ return idx; + } + public: - void xchng( + void xchng ( const arr_t &a, const idx_t &idx_send, const idx_t &idx_recv - ) + ) override { if(thread_rank != 0) return; - - // domain to be sent but extended in y to the full sharedmem domain - idx_t idx_send_shmem(idx_send); - idx_t idx_recv_shmem(idx_recv); + parent_t::xchng(a, extend_idx(idx_send), extend_idx(idx_recv)); + } - // try to guess what should be the whole domain exchanged by this process - // based on the difference between idx_send and thread_j - const auto j_rng_frst_diff = idx_send.lbound(1) - thread_j.lbound(0); - const auto j_rng_scnd_diff = idx_send.ubound(1) - thread_j.ubound(0); - idx_send_shmem.lbound(1) = 0 + j_rng_frst_diff; // does it have to start at 0? - idx_send_shmem.ubound(1) = grid_size_y + j_rng_scnd_diff; // does it have to end at grid_size_y? what about courants? - idx_recv_shmem.lbound(1) = 0 + j_rng_frst_diff; // does it have to start at 0? - idx_recv_shmem.ubound(1) = grid_size_y + j_rng_scnd_diff; // does it have to end at grid_size_y? what about courants? + void send ( + const arr_t &a, + const idx_t &idx_send + ) override + { + if(thread_rank != 0) return; + parent_t::send(a, extend_idx(idx_send)); + } - parent_t::xchng(a, idx_send, idx_recv); + void recv ( + const arr_t &a, + const idx_t &idx_recv + ) override + { + if(thread_rank != 0) return; + parent_t::recv(a, extend_idx(idx_recv)); } // ctor diff --git a/libmpdata++/bcond/detail/remote_common.hpp b/libmpdata++/bcond/detail/remote_common.hpp index d0a0c57e..b1bcb2d3 100644 --- a/libmpdata++/bcond/detail/remote_common.hpp +++ b/libmpdata++/bcond/detail/remote_common.hpp @@ -127,7 +127,7 @@ namespace libmpdataxx #endif } - void send( + virtual void send( const arr_t &a, const idx_t &idx_send ) @@ -142,7 +142,7 @@ namespace libmpdataxx #endif } - void recv( + virtual void recv( const arr_t &a, const idx_t &idx_recv ) @@ -171,7 +171,7 @@ namespace libmpdataxx #endif } - void xchng( + virtual void xchng( const arr_t &a, const idx_t &idx_send, const idx_t &idx_recv From 8561a70ab43453cfce5dfe078343c8fdcfbfbd9d Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Mon, 12 Oct 2020 17:39:20 +0200 Subject: [PATCH 15/44] fix mpi min and max for omp 0 --- libmpdata++/concurr/detail/sharedmem.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libmpdata++/concurr/detail/sharedmem.hpp b/libmpdata++/concurr/detail/sharedmem.hpp index 0a1115cd..78cdfb46 100644 --- a/libmpdata++/concurr/detail/sharedmem.hpp +++ b/libmpdata++/concurr/detail/sharedmem.hpp @@ -210,12 +210,12 @@ namespace libmpdataxx #else if(rank == 0) { - (*xtmtmp)(shmem_decomp_dim) = blitz::min(*xtmtmp); + (*xtmtmp)(0) = blitz::min(*xtmtmp); // min across mpi processes - (*xtmtmp)(shmem_decomp_dim) = this->distmem.min((*xtmtmp)(shmem_decomp_dim)); + (*xtmtmp)(0) = this->distmem.min((*xtmtmp)(0)); } barrier(); - real_t res = (*xtmtmp)(shmem_decomp_dim); // propagate the total min to all threads of the process + real_t res = (*xtmtmp)(0); // propagate the total min to all threads of the process barrier(); // to avoid xtmtmp being overwritten by some other threads' next sum call return res; #endif @@ -234,12 +234,12 @@ namespace libmpdataxx #else if(rank == 0) { - (*xtmtmp)(shmem_decomp_dim) = blitz::max(*xtmtmp); + (*xtmtmp)(0) = blitz::max(*xtmtmp); // max across mpi processes - (*xtmtmp)(shmem_decomp_dim) = this->distmem.max((*xtmtmp)(shmem_decomp_dim)); + (*xtmtmp)(0) = this->distmem.max((*xtmtmp)(0)); } barrier(); - real_t res = (*xtmtmp)(shmem_decomp_dim); // 
propagate the total max to all threads of the process + real_t res = (*xtmtmp)(0); // propagate the total max to all threads of the process barrier(); // to avoid xtmtmp being overwritten by some other threads' next sum call return res; #endif From d695c84f70e7bf3366d90679a37ca0bdf9977e2e Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Tue, 13 Oct 2020 12:33:14 +0200 Subject: [PATCH 16/44] shmem_y mpi with 2 threads doing mpi; NOTE: messed up extend_range --- libmpdata++/bcond/detail/bcond_common.hpp | 2 +- libmpdata++/bcond/detail/remote_3d_common.hpp | 44 +++++---- libmpdata++/bcond/detail/remote_common.hpp | 28 +++++- libmpdata++/bcond/remote_3d.hpp | 97 +++++++++++++++---- libmpdata++/concurr/detail/concurr_common.hpp | 5 +- libmpdata++/solvers/detail/solver_3d.hpp | 12 ++- libmpdata++/solvers/detail/solver_common.hpp | 2 +- 7 files changed, 144 insertions(+), 46 deletions(-) diff --git a/libmpdata++/bcond/detail/bcond_common.hpp b/libmpdata++/bcond/detail/bcond_common.hpp index 6d7ecf4d..1e9c6df2 100644 --- a/libmpdata++/bcond/detail/bcond_common.hpp +++ b/libmpdata++/bcond/detail/bcond_common.hpp @@ -15,7 +15,7 @@ namespace libmpdataxx using namespace arakawa_c; enum bcond_e { null, cyclic, polar, open, rigid, remote, gndsky, custom }; - enum drctn_e { left=0, rght=1 }; + enum drctn_e { left, rght }; template< typename real_t, diff --git a/libmpdata++/bcond/detail/remote_3d_common.hpp b/libmpdata++/bcond/detail/remote_3d_common.hpp index cedbabde..9599191f 100644 --- a/libmpdata++/bcond/detail/remote_3d_common.hpp +++ b/libmpdata++/bcond/detail/remote_3d_common.hpp @@ -18,10 +18,15 @@ namespace libmpdataxx { using parent_t = detail::remote_common; - using arr_t = blitz::Array; - using idx_t = blitz::RectDomain<3>; + protected: + + using arr_t = typename parent_t::arr_t; + using idx_t = typename parent_t::idx_t; + + const int thread_rank, thread_size; + + private: - const int thread_rank; const rng_t thread_j; const int grid_size_y; @@ -29,8 +34,10 @@ namespace libmpdataxx // based on the difference between idx to be sent by this thread and idx of this process idx_t extend_idx(idx_t idx) { - idx.lbound(1) = 0 + idx.lbound(1) - thread_j.lbound(0); // does it have to start at 0? - idx.ubound(1) = grid_size_y + idx.ubound(1) - thread_j.ubound(0); // does it have to end at grid_size_y? what about courants? +std::cerr << "extend idx start idx(1): " << idx.lbound(1) << ", " << idx.ubound(1) << std::endl; + idx.lbound(1) = 0 + idx.lbound(1) - thread_j.first(); // does it have to start at 0? + idx.ubound(1) = (grid_size_y - 1) + idx.ubound(1) - thread_j.last(); // does it have to end at grid_size_y - 1? 
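// e.g. with no halo extension the thread's own range [thread_j.first(), thread_j.last()] maps
// onto [0, grid_size_y - 1]; a range grown by one cell on each side, first()-1 .. last()+1,
// maps onto -1 .. grid_size_y, i.e. the process-wide extent grown by the same one-cell margin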
+std::cerr << "extend idx end idx(1): " << idx.lbound(1) << ", " << idx.ubound(1) << std::endl; return idx; } @@ -40,27 +47,24 @@ namespace libmpdataxx const arr_t &a, const idx_t &idx_send, const idx_t &idx_recv - ) override + ) { - if(thread_rank != 0) return; parent_t::xchng(a, extend_idx(idx_send), extend_idx(idx_recv)); } void send ( const arr_t &a, const idx_t &idx_send - ) override + ) { - if(thread_rank != 0) return; parent_t::send(a, extend_idx(idx_send)); } void recv ( const arr_t &a, const idx_t &idx_recv - ) override + ) { - if(thread_rank != 0) return; parent_t::recv(a, extend_idx(idx_recv)); } @@ -68,21 +72,27 @@ namespace libmpdataxx remote_3d_common( const rng_t &i, const std::array &distmem_grid_size, - const rng_t thread_j, - const int thread_rank + const rng_t _thread_j, + const int thread_rank, + const int thread_size ) : parent_t(i, distmem_grid_size), thread_rank(thread_rank), - thread_j(thread_j), + thread_size(thread_size), + thread_j(_thread_j), grid_size_y(distmem_grid_size[1]) { #if defined(USE_MPI) - // only thread 0 does mpi, others don't need buffers - if(thread_rank != 0) + // only 2 threads do mpi, others don't need buffers + if(thread_rank != 0 && thread_rank != thread_size-1) { free(parent_t::buf_send); free(parent_t::buf_recv); } +std::cerr << "remote_3d_common ctor thread_j: " << thread_j.lbound(0) << ", " << thread_j.ubound(0) << std::endl; +std::cerr << "remote_3d_common ctor _thread_j: " << _thread_j.lbound(0) << ", " << _thread_j.ubound(0) << std::endl; +std::cerr << "remote_3d_common ctor f-l thread_j: " << thread_j.first() << ", " << thread_j.last() << std::endl; +std::cerr << "remote_3d_common ctor f-l _thread_j: " << _thread_j.first() << ", " << _thread_j.last() << std::endl; #endif } @@ -90,7 +100,7 @@ namespace libmpdataxx ~remote_3d_common() { #if defined(USE_MPI) - if(thread_rank == 0) + if(thread_rank == 0 || thread_rank == thread_size-1) { free(parent_t::buf_send); free(parent_t::buf_recv); diff --git a/libmpdata++/bcond/detail/remote_common.hpp b/libmpdata++/bcond/detail/remote_common.hpp index b1bcb2d3..238a7e0a 100644 --- a/libmpdata++/bcond/detail/remote_common.hpp +++ b/libmpdata++/bcond/detail/remote_common.hpp @@ -78,6 +78,12 @@ namespace libmpdataxx const int msg_send = dir == left ? left : rght; + std::cerr << "send_hlpr idx dir " << dir << " : " + << " (" << idx_send.lbound(0) << ", " << idx_send.ubound(0) << ")" + << " (" << idx_send.lbound(1) << ", " << idx_send.ubound(1) << ")" + << " (" << idx_send.lbound(2) << ", " << idx_send.ubound(2) << ")" + << std::endl; + // arr_send references part of the send buffer that will be used arr_t arr_send(buf_send, a(idx_send).shape(), blitz::neverDeleteData); // copying data to be sent @@ -112,6 +118,13 @@ namespace libmpdataxx const int msg_recv = dir == left ? 
rght : left; + std::cerr << "recv_hlpr idx dir " << dir << " : " + << " (" << idx_recv.lbound(0) << ", " << idx_recv.ubound(0) << ")" + << " (" << idx_recv.lbound(1) << ", " << idx_recv.ubound(1) << ")" + << " (" << idx_recv.lbound(2) << ", " << idx_recv.ubound(2) << ")" + << std::endl; + + // launching async data transfer if(a(idx_recv).size()!=0) // TODO: test directly size of idx_recv { @@ -127,7 +140,7 @@ namespace libmpdataxx #endif } - virtual void send( + void send( const arr_t &a, const idx_t &idx_send ) @@ -142,7 +155,7 @@ namespace libmpdataxx #endif } - virtual void recv( + void recv( const arr_t &a, const idx_t &idx_recv ) @@ -171,7 +184,7 @@ namespace libmpdataxx #endif } - virtual void xchng( + void xchng( const arr_t &a, const idx_t &idx_send, const idx_t &idx_recv @@ -211,7 +224,15 @@ namespace libmpdataxx parent_t(i, distmem_grid_size) { #if defined(USE_MPI) + const int slice_size = n_dims==1 ? 1 : (n_dims==2? distmem_grid_size[1]+6 : (distmem_grid_size[1]+6) * (distmem_grid_size[2]+6) ); // 3 is the max halo size (?), so 6 on both sides +std::cerr << "remote_common ctor, " + << " distmem_grid_size[0]: " << distmem_grid_size[0] + << " distmem_grid_size[1]: " << distmem_grid_size[1] + << " distmem_grid_size[2]: " << distmem_grid_size[2] + << " slice_size: " << slice_size + << " halo: " << halo + << std::endl; // allocate enough memory in buffers to store largest halos to be sent buf_send = (real_t *) malloc(halo * slice_size * sizeof(real_t)); buf_recv = (real_t *) malloc(halo * slice_size * sizeof(real_t)); @@ -230,3 +251,4 @@ namespace libmpdataxx } } // namespace bcond } // namespace libmpdataxx + diff --git a/libmpdata++/bcond/remote_3d.hpp b/libmpdata++/bcond/remote_3d.hpp index ed401ad4..acceb4bd 100644 --- a/libmpdata++/bcond/remote_3d.hpp +++ b/libmpdata++/bcond/remote_3d.hpp @@ -22,17 +22,46 @@ namespace libmpdataxx { using parent_t = detail::remote_3d_common; - using arr_t = blitz::Array; + using arr_t = typename parent_t::arr_t; + using idx_t = typename parent_t::idx_t; using parent_t::parent_t; // inheriting ctor const int off = this->is_cyclic ? 0 : -1; + void xchng ( + const arr_t &a, + const idx_t &idx_send, + const idx_t &idx_recv + ) + { + if(this->thread_rank!=0) return; // left MPI calls only done by thread rank 0 + parent_t::xchng(a, idx_send, idx_recv); + } + + void send ( + const arr_t &a, + const idx_t &idx_send + ) + { + if(this->thread_rank!=0) return; // left MPI calls only done by thread rank 0 + parent_t::send(a, idx_send); + } + + void recv ( + const arr_t &a, + const idx_t &idx_recv + ) + { + if(this->thread_rank!=0) return; // left MPI calls only done by thread rank 0 + parent_t::recv(a, idx_recv); + } + public: void fill_halos_sclr(arr_t &a, const rng_t &j, const rng_t &k, const bool deriv = false) { using namespace idxperm; - this->xchng(a, pi(this->left_intr_sclr + off, j, k), pi(this->left_halo_sclr, j, k)); + xchng(a, pi(this->left_intr_sclr + off, j, k), pi(this->left_halo_sclr, j, k)); } void fill_halos_pres(arr_t &a, const rng_t &j, const rng_t &k) @@ -52,12 +81,12 @@ namespace libmpdataxx { if(halo == 1) // see remote_2d - this->send(av[0], pi(this->left_intr_vctr + off, j, k)); // TODO: no need to receive? the vector in halo was calculated anyway? + send(av[0], pi(this->left_intr_vctr + off, j, k)); // TODO: no need to receive? the vector in halo was calculated anyway? 
else - this->xchng(av[0], pi(this->left_intr_vctr + off, j, k), pi((this->left_halo_vctr^h)^(-1), j, k)); // ditto + xchng(av[0], pi(this->left_intr_vctr + off, j, k), pi((this->left_halo_vctr^h)^(-1), j, k)); // ditto } else - this->xchng(av[0], pi(this->left_intr_vctr + off, j, k), pi(this->left_halo_vctr, j, k)); + xchng(av[0], pi(this->left_intr_vctr + off, j, k), pi(this->left_halo_vctr, j, k)); } void fill_halos_sgs_div(arr_t &a, const rng_t &j, const rng_t &k) @@ -73,12 +102,12 @@ namespace libmpdataxx { if(halo == 1) // see remote_2d - this->send(av[0 + offset], pi(this->left_intr_vctr + off, j, k)); + send(av[0 + offset], pi(this->left_intr_vctr + off, j, k)); else - this->xchng(av[0 + offset], pi(this->left_intr_vctr + off, j, k), pi((this->left_halo_vctr^h)^(-1), j, k)); + xchng(av[0 + offset], pi(this->left_intr_vctr + off, j, k), pi((this->left_halo_vctr^h)^(-1), j, k)); } else - this->xchng(av[0 + offset], pi(this->left_intr_vctr + off, j, k), pi(this->left_halo_vctr, j, k)); + xchng(av[0 + offset], pi(this->left_intr_vctr + off, j, k), pi(this->left_halo_vctr, j, k)); } void fill_halos_sgs_tnsr(arrvec_t &av, const arr_t &, const arr_t &, const rng_t &j, const rng_t &k, const real_t) @@ -109,7 +138,7 @@ namespace libmpdataxx using namespace idxperm; assert(halo>=1); - this->xchng(a, pi(this->left_edge_sclr, j, k), pi(this->left_halo_sclr.last(), j, k)); + xchng(a, pi(this->left_edge_sclr, j, k), pi(this->left_halo_sclr.last(), j, k)); } void avg_edge_and_halo1_sclr_cyclic(arr_t &a, const rng_t &j, const rng_t &k) @@ -134,17 +163,49 @@ namespace libmpdataxx > : public detail::remote_3d_common { using parent_t = detail::remote_3d_common; - using arr_t = blitz::Array; + using arr_t = typename parent_t::arr_t; + using idx_t = typename parent_t::idx_t; using parent_t::parent_t; // inheriting ctor const int off = this->is_cyclic ? 
0 : 1; + void xchng ( + const arr_t &a, + const idx_t &idx_send, + const idx_t &idx_recv + ) + { + if(this->thread_rank != this->thread_size-1) return; // right MPI calls only done by the highest ranked thread + // if(this->thread_rank != 0) return; // temporary + parent_t::xchng(a, idx_send, idx_recv); + } + + void send ( + const arr_t &a, + const idx_t &idx_send + ) + { + if(this->thread_rank != this->thread_size-1) return; // right MPI calls only done by the highest ranked thread + // if(this->thread_rank != 0) return; // temporary + parent_t::send(a, idx_send); + } + + void recv ( + const arr_t &a, + const idx_t &idx_recv + ) + { + if(this->thread_rank != this->thread_size-1) return; // right MPI calls only done by the highest ranked thread + // if(this->thread_rank != 0) return; // temporary + parent_t::recv(a, idx_recv); + } + public: void fill_halos_sclr(arr_t &a, const rng_t &j, const rng_t &k, const bool deriv = false) { using namespace idxperm; - this->xchng(a, pi(this->rght_intr_sclr + off, j, k), pi(this->rght_halo_sclr, j, k)); + xchng(a, pi(this->rght_intr_sclr + off, j, k), pi(this->rght_halo_sclr, j, k)); } void fill_halos_pres(arr_t &a, const rng_t &j, const rng_t &k) @@ -163,12 +224,12 @@ namespace libmpdataxx if(!this->is_cyclic) { if(halo == 1) - this->recv(av[0], pi(this->rght_halo_vctr, j, k)); + recv(av[0], pi(this->rght_halo_vctr, j, k)); else - this->xchng(av[0], pi(((this->rght_intr_vctr + off)^h)^(-1), j, k), pi(this->rght_halo_vctr, j, k)); + xchng(av[0], pi(((this->rght_intr_vctr + off)^h)^(-1), j, k), pi(this->rght_halo_vctr, j, k)); } else - this->xchng(av[0], pi(this->rght_intr_vctr + off, j, k), pi(this->rght_halo_vctr, j, k)); + xchng(av[0], pi(this->rght_intr_vctr + off, j, k), pi(this->rght_halo_vctr, j, k)); } void fill_halos_sgs_div(arr_t &a, const rng_t &j, const rng_t &k) @@ -183,12 +244,12 @@ namespace libmpdataxx if(!this->is_cyclic) { if(halo == 1) - this->recv(av[0 + offset], pi(this->rght_halo_vctr, j, k)); + recv(av[0 + offset], pi(this->rght_halo_vctr, j, k)); else - this->xchng(av[0 + offset], pi(((this->rght_intr_vctr + off)^h)^(-1), j, k), pi(this->rght_halo_vctr, j, k)); + xchng(av[0 + offset], pi(((this->rght_intr_vctr + off)^h)^(-1), j, k), pi(this->rght_halo_vctr, j, k)); } else - this->xchng(av[0 + offset], pi(this->rght_intr_vctr + off, j, k), pi(this->rght_halo_vctr, j, k)); + xchng(av[0 + offset], pi(this->rght_intr_vctr + off, j, k), pi(this->rght_halo_vctr, j, k)); } void fill_halos_sgs_tnsr(arrvec_t &av, const arr_t &, const arr_t &, const rng_t &j, const rng_t &k, const real_t) @@ -219,7 +280,7 @@ namespace libmpdataxx using namespace idxperm; assert(halo>=1); - this->xchng(a, pi(this->rght_edge_sclr, j, k), pi(this->rght_halo_sclr.first(), j, k)); + xchng(a, pi(this->rght_edge_sclr, j, k), pi(this->rght_halo_sclr.first(), j, k)); } void avg_edge_and_halo1_sclr_cyclic(arr_t &a, const rng_t &j, const rng_t &k) diff --git a/libmpdata++/concurr/detail/concurr_common.hpp b/libmpdata++/concurr/detail/concurr_common.hpp index 717b9bac..9cacedd2 100644 --- a/libmpdata++/concurr/detail/concurr_common.hpp +++ b/libmpdata++/concurr/detail/concurr_common.hpp @@ -88,8 +88,9 @@ namespace libmpdataxx new bcond::bcond( mem->slab(mem->grid_size[dim]), mem->distmem.grid_size, - mem->slab(mem->grid_size[1], thread_rank, thread_size), - thread_rank + mem->slab(mem->grid_size[1], thread_rank, thread_size), // NOTE: we assume here remote 3d bcond is only on the edges perpendicular to x + thread_rank, + thread_size ) ); } diff --git 
a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index 96649fd9..68da9d31 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -43,7 +43,8 @@ namespace libmpdataxx const bool deriv = false ) final // for a given array { - const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); + //const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); + const auto range_ijk_1__ext = range_ijk[1]^ext; this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->fill_halos_sclr(arr, range_ijk_1__ext, range_ijk[2]^ext, deriv); for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk[0]^ext, deriv); @@ -154,8 +155,10 @@ namespace libmpdataxx ) final { this->mem->barrier(); - const auto range_ijk_1__ext_h = this->extend_range(range_ijk[1], ext, h); - const auto range_ijk_1__ext_1 = this->extend_range(range_ijk[1], ext, 1); + // const auto range_ijk_1__ext_h = this->extend_range(range_ijk[1], ext, h); + // const auto range_ijk_1__ext_1 = this->extend_range(range_ijk[1], ext, 1); + const auto range_ijk_1__ext_h = range_ijk[1]^h; + const auto range_ijk_1__ext_1 = range_ijk[1]^1; if (!cyclic) { for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk[2]^ext^1, range_ijk[0]^ext^h); @@ -197,7 +200,8 @@ namespace libmpdataxx const int ext = 0 ) final { - const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); + // const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); + const auto range_ijk_1__ext = range_ijk[1]^ext; this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk[0]^ext); diff --git a/libmpdata++/solvers/detail/solver_common.hpp b/libmpdata++/solvers/detail/solver_common.hpp index c70cb54e..93ad1143 100644 --- a/libmpdata++/solvers/detail/solver_common.hpp +++ b/libmpdata++/solvers/detail/solver_common.hpp @@ -133,7 +133,7 @@ namespace libmpdataxx scale(e, -ct_params_t::hint_scale(e)); } - // thread-aware range extension + // thread-aware range extension, messes range guessing in remote_3d bcond template rng_t extend_range(const rng_t &r, const n_t n) const { From 5247ae2e33ba29eda6d52e25995c5351f935e463 Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Tue, 13 Oct 2020 13:42:25 +0200 Subject: [PATCH 17/44] bring back proper extend_range --- libmpdata++/solvers/detail/solver_3d.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index 68da9d31..89464a64 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -43,8 +43,8 @@ namespace libmpdataxx const bool deriv = false ) final // for a given array { - //const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); - const auto range_ijk_1__ext = range_ijk[1]^ext; + const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); + // const auto range_ijk_1__ext = range_ijk[1]^ext; this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->fill_halos_sclr(arr, range_ijk_1__ext, range_ijk[2]^ext, deriv); for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk[0]^ext, deriv); @@ -155,10 +155,10 @@ namespace libmpdataxx ) final { this->mem->barrier(); - // const auto range_ijk_1__ext_h = this->extend_range(range_ijk[1], ext, h); - // const auto 
range_ijk_1__ext_1 = this->extend_range(range_ijk[1], ext, 1); - const auto range_ijk_1__ext_h = range_ijk[1]^h; - const auto range_ijk_1__ext_1 = range_ijk[1]^1; + const auto range_ijk_1__ext_h = this->extend_range(range_ijk[1], ext, h); + const auto range_ijk_1__ext_1 = this->extend_range(range_ijk[1], ext, 1); + // const auto range_ijk_1__ext_h = range_ijk[1]^h; + // const auto range_ijk_1__ext_1 = range_ijk[1]^1; if (!cyclic) { for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk[2]^ext^1, range_ijk[0]^ext^h); @@ -200,8 +200,8 @@ namespace libmpdataxx const int ext = 0 ) final { - // const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); - const auto range_ijk_1__ext = range_ijk[1]^ext; + const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); + // const auto range_ijk_1__ext = range_ijk[1]^ext; this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk[0]^ext); From f131e4134f29b40d334fe6e8737ed92c7e63a119 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Tue, 13 Oct 2020 15:14:08 +0200 Subject: [PATCH 18/44] disable shmem range extension for boundaries in x, fixes mpi (?) but messes up other type of bcond on this edge --- libmpdata++/solvers/detail/solver_3d.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index 89464a64..831b1fb8 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -46,7 +46,7 @@ namespace libmpdataxx const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); // const auto range_ijk_1__ext = range_ijk[1]^ext; this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_sclr(arr, range_ijk_1__ext, range_ijk[2]^ext, deriv); + for (auto &bc : this->bcs[0]) bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv); for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk[0]^ext, deriv); for (auto &bc : this->bcs[2]) bc->fill_halos_sclr(arr, range_ijk[0]^ext, range_ijk_1__ext, deriv); this->mem->barrier(); @@ -174,10 +174,10 @@ namespace libmpdataxx for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk[0]^ext^h, range_ijk_1__ext_1); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); } else @@ -185,10 +185,10 @@ namespace libmpdataxx for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[2]^ext^1, range_ijk[0]^ext^h); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[0]^ext^h, range_ijk_1__ext_1); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, 
range_ijk[2]^ext^1); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); } this->mem->barrier(); @@ -203,7 +203,7 @@ namespace libmpdataxx const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); // const auto range_ijk_1__ext = range_ijk[1]^ext; this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); + for (auto &bc : this->bcs[0]) bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext); for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk[0]^ext); for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk[0]^ext, range_ijk_1__ext); this->mem->barrier(); From 2a2bb7fd51fd2c81ba3d2d7d0245ea8e41c1717a Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Wed, 14 Oct 2020 13:33:14 +0200 Subject: [PATCH 19/44] solver 3d: barrier after xchng_flux, 2d and 3d: barriers before set and save edge --- libmpdata++/solvers/detail/solver_2d.hpp | 2 ++ libmpdata++/solvers/detail/solver_3d.hpp | 3 +++ 2 files changed, 5 insertions(+) diff --git a/libmpdata++/solvers/detail/solver_2d.hpp b/libmpdata++/solvers/detail/solver_2d.hpp index 2a95b563..c556e5cc 100644 --- a/libmpdata++/solvers/detail/solver_2d.hpp +++ b/libmpdata++/solvers/detail/solver_2d.hpp @@ -170,6 +170,7 @@ namespace libmpdataxx const int &sign ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], sign); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[0], sign); this->mem->barrier(); @@ -180,6 +181,7 @@ namespace libmpdataxx const idx_t<2> &range_ijk ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1]); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[0]); this->mem->barrier(); diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index 831b1fb8..5350dac8 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -80,6 +80,7 @@ namespace libmpdataxx for (auto &bc : this->bcs[0]) bc->fill_halos_flux(arrvec, j, k); for (auto &bc : this->bcs[1]) bc->fill_halos_flux(arrvec, k, i); for (auto &bc : this->bcs[2]) bc->fill_halos_flux(arrvec, i, j); + this->mem->barrier(); } virtual void xchng_sgs_div( @@ -215,6 +216,7 @@ namespace libmpdataxx const int &sign ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[2], range_ijk[0], sign); for (auto &bc : this->bcs[2]) bc->set_edge_pres(av[2], range_ijk[0], range_ijk[1], sign); @@ -226,6 +228,7 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->save_edge_vel(av[2], range_ijk[0], range_ijk[1]); From 998100bf675828ba72538cd6c20375fd6323fe6f Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Wed, 14 Oct 2020 15:40:11 
+0200 Subject: [PATCH 20/44] comment mpi debug output --- libmpdata++/bcond/detail/remote_3d_common.hpp | 12 +++---- libmpdata++/bcond/detail/remote_common.hpp | 34 +++++++++---------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/libmpdata++/bcond/detail/remote_3d_common.hpp b/libmpdata++/bcond/detail/remote_3d_common.hpp index 9599191f..9db71f65 100644 --- a/libmpdata++/bcond/detail/remote_3d_common.hpp +++ b/libmpdata++/bcond/detail/remote_3d_common.hpp @@ -34,10 +34,10 @@ namespace libmpdataxx // based on the difference between idx to be sent by this thread and idx of this process idx_t extend_idx(idx_t idx) { -std::cerr << "extend idx start idx(1): " << idx.lbound(1) << ", " << idx.ubound(1) << std::endl; +//std::cerr << "extend idx start idx(1): " << idx.lbound(1) << ", " << idx.ubound(1) << std::endl; idx.lbound(1) = 0 + idx.lbound(1) - thread_j.first(); // does it have to start at 0? idx.ubound(1) = (grid_size_y - 1) + idx.ubound(1) - thread_j.last(); // does it have to end at grid_size_y - 1? -std::cerr << "extend idx end idx(1): " << idx.lbound(1) << ", " << idx.ubound(1) << std::endl; +//std::cerr << "extend idx end idx(1): " << idx.lbound(1) << ", " << idx.ubound(1) << std::endl; return idx; } @@ -89,10 +89,10 @@ std::cerr << "extend idx end idx(1): " << idx.lbound(1) << ", " << idx.ubound(1) free(parent_t::buf_send); free(parent_t::buf_recv); } -std::cerr << "remote_3d_common ctor thread_j: " << thread_j.lbound(0) << ", " << thread_j.ubound(0) << std::endl; -std::cerr << "remote_3d_common ctor _thread_j: " << _thread_j.lbound(0) << ", " << _thread_j.ubound(0) << std::endl; -std::cerr << "remote_3d_common ctor f-l thread_j: " << thread_j.first() << ", " << thread_j.last() << std::endl; -std::cerr << "remote_3d_common ctor f-l _thread_j: " << _thread_j.first() << ", " << _thread_j.last() << std::endl; +//std::cerr << "remote_3d_common ctor thread_j: " << thread_j.lbound(0) << ", " << thread_j.ubound(0) << std::endl; +//std::cerr << "remote_3d_common ctor _thread_j: " << _thread_j.lbound(0) << ", " << _thread_j.ubound(0) << std::endl; +//std::cerr << "remote_3d_common ctor f-l thread_j: " << thread_j.first() << ", " << thread_j.last() << std::endl; +//std::cerr << "remote_3d_common ctor f-l _thread_j: " << _thread_j.first() << ", " << _thread_j.last() << std::endl; #endif } diff --git a/libmpdata++/bcond/detail/remote_common.hpp b/libmpdata++/bcond/detail/remote_common.hpp index 238a7e0a..c7dfffb9 100644 --- a/libmpdata++/bcond/detail/remote_common.hpp +++ b/libmpdata++/bcond/detail/remote_common.hpp @@ -78,11 +78,11 @@ namespace libmpdataxx const int msg_send = dir == left ? left : rght; - std::cerr << "send_hlpr idx dir " << dir << " : " - << " (" << idx_send.lbound(0) << ", " << idx_send.ubound(0) << ")" - << " (" << idx_send.lbound(1) << ", " << idx_send.ubound(1) << ")" - << " (" << idx_send.lbound(2) << ", " << idx_send.ubound(2) << ")" - << std::endl; +// std::cerr << "send_hlpr idx dir " << dir << " : " +// << " (" << idx_send.lbound(0) << ", " << idx_send.ubound(0) << ")" +// << " (" << idx_send.lbound(1) << ", " << idx_send.ubound(1) << ")" +// << " (" << idx_send.lbound(2) << ", " << idx_send.ubound(2) << ")" +// << std::endl; // arr_send references part of the send buffer that will be used arr_t arr_send(buf_send, a(idx_send).shape(), blitz::neverDeleteData); @@ -118,11 +118,11 @@ namespace libmpdataxx const int msg_recv = dir == left ? 
rght : left; - std::cerr << "recv_hlpr idx dir " << dir << " : " - << " (" << idx_recv.lbound(0) << ", " << idx_recv.ubound(0) << ")" - << " (" << idx_recv.lbound(1) << ", " << idx_recv.ubound(1) << ")" - << " (" << idx_recv.lbound(2) << ", " << idx_recv.ubound(2) << ")" - << std::endl; +// std::cerr << "recv_hlpr idx dir " << dir << " : " +// << " (" << idx_recv.lbound(0) << ", " << idx_recv.ubound(0) << ")" +// << " (" << idx_recv.lbound(1) << ", " << idx_recv.ubound(1) << ")" +// << " (" << idx_recv.lbound(2) << ", " << idx_recv.ubound(2) << ")" +// << std::endl; // launching async data transfer @@ -226,13 +226,13 @@ namespace libmpdataxx #if defined(USE_MPI) const int slice_size = n_dims==1 ? 1 : (n_dims==2? distmem_grid_size[1]+6 : (distmem_grid_size[1]+6) * (distmem_grid_size[2]+6) ); // 3 is the max halo size (?), so 6 on both sides -std::cerr << "remote_common ctor, " - << " distmem_grid_size[0]: " << distmem_grid_size[0] - << " distmem_grid_size[1]: " << distmem_grid_size[1] - << " distmem_grid_size[2]: " << distmem_grid_size[2] - << " slice_size: " << slice_size - << " halo: " << halo - << std::endl; +//std::cerr << "remote_common ctor, " +// << " distmem_grid_size[0]: " << distmem_grid_size[0] +// << " distmem_grid_size[1]: " << distmem_grid_size[1] +// << " distmem_grid_size[2]: " << distmem_grid_size[2] +// << " slice_size: " << slice_size +// << " halo: " << halo +// << std::endl; // allocate enough memory in buffers to store largest halos to be sent buf_send = (real_t *) malloc(halo * slice_size * sizeof(real_t)); buf_recv = (real_t *) malloc(halo * slice_size * sizeof(real_t)); From 219ca407174e8207451e185a30de444bd82ae0b2 Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Thu, 15 Oct 2020 11:15:53 +0200 Subject: [PATCH 21/44] solver_3d: barriers after bc[0] fill halos, before other halos --- libmpdata++/solvers/detail/solver_3d.hpp | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index 5350dac8..3573c7a8 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -47,6 +47,7 @@ namespace libmpdataxx // const auto range_ijk_1__ext = range_ijk[1]^ext; this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk[0]^ext, deriv); for (auto &bc : this->bcs[2]) bc->fill_halos_sclr(arr, range_ijk[0]^ext, range_ijk_1__ext, deriv); this->mem->barrier(); @@ -62,12 +63,14 @@ namespace libmpdataxx if (!cyclic) { for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng(arrvec, i, j, ad); } else { for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng_cyclic(arrvec, j, k, ad); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng_cyclic(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng_cyclic(arrvec, i, j, ad); } @@ -78,6 +81,7 @@ namespace libmpdataxx { this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->fill_halos_flux(arrvec, j, k); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_flux(arrvec, k, i); for (auto &bc : this->bcs[2]) bc->fill_halos_flux(arrvec, i, j); this->mem->barrier(); @@ -102,6 +106,7 @@ namespace libmpdataxx { 
this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_vctr(av, b, range_ijk[1], range_ijk[2]); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_vctr(av, b, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_vctr(av, b, range_ijk[0], range_ijk[1]); this->mem->barrier(); @@ -115,6 +120,7 @@ namespace libmpdataxx { this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[1], range_ijk[2], this->dijk[0]); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[2], range_ijk[0], this->dijk[1]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[0], range_ijk[1], this->dijk[2]); this->mem->barrier(); @@ -131,7 +137,9 @@ namespace libmpdataxx for (auto &bc : this->bcs[0]) { bc->fill_halos_sgs_vctr(av, bv[0], range_ijkm[1], range_ijk[2]^1, 3); + this->mem->barrier(); bc->fill_halos_sgs_vctr(av, bv[1], range_ijk[1]^1, range_ijkm[2], 4); + this->mem->barrier(); } for (auto &bc : this->bcs[1]) @@ -176,9 +184,11 @@ namespace libmpdataxx for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk[0]^ext^h, range_ijk_1__ext_1); for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); + this->mem->barrier(); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); } else @@ -187,9 +197,11 @@ namespace libmpdataxx for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[0]^ext^h, range_ijk_1__ext_1); for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); + this->mem->barrier(); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); } this->mem->barrier(); @@ -205,6 +217,7 @@ namespace libmpdataxx // const auto range_ijk_1__ext = range_ijk[1]^ext; this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk[0]^ext); for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk[0]^ext, range_ijk_1__ext); this->mem->barrier(); @@ -218,6 +231,7 @@ namespace libmpdataxx { this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[2], range_ijk[0], sign); for (auto &bc : this->bcs[2]) bc->set_edge_pres(av[2], range_ijk[0], range_ijk[1], sign); this->mem->barrier(); @@ -230,6 +244,7 @@ namespace libmpdataxx { this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->save_edge_vel(av[2], range_ijk[0], 
range_ijk[1]); this->mem->barrier(); From a6e881a055aa162ea83c41f9450d6558b5a29c11 Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Thu, 15 Oct 2020 12:27:54 +0200 Subject: [PATCH 22/44] xchng methods in solver 3d: do bcs in x last, hence decrease number of barriers because a barrier is needed after remote bcond --- libmpdata++/solvers/detail/solver_3d.hpp | 65 ++++++++++-------------- 1 file changed, 28 insertions(+), 37 deletions(-) diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index 3573c7a8..91c9b21f 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -45,11 +45,10 @@ namespace libmpdataxx { const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); // const auto range_ijk_1__ext = range_ijk[1]^ext; - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk[0]^ext, deriv); for (auto &bc : this->bcs[2]) bc->fill_halos_sclr(arr, range_ijk[0]^ext, range_ijk_1__ext, deriv); + for (auto &bc : this->bcs[0]) bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv); this->mem->barrier(); } void xchng(int e) final @@ -62,28 +61,25 @@ namespace libmpdataxx this->mem->barrier(); if (!cyclic) { - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); - this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng(arrvec, i, j, ad); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); } else { - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng_cyclic(arrvec, j, k, ad); - this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng_cyclic(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng_cyclic(arrvec, i, j, ad); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng_cyclic(arrvec, j, k, ad); } this->mem->barrier(); } virtual void xchng_flux(arrvec_t &arrvec) final { - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_flux(arrvec, j, k); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_flux(arrvec, k, i); for (auto &bc : this->bcs[2]) bc->fill_halos_flux(arrvec, i, j); + for (auto &bc : this->bcs[0]) bc->fill_halos_flux(arrvec, j, k); this->mem->barrier(); } @@ -104,11 +100,10 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_vctr(av, b, range_ijk[1], range_ijk[2]); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_vctr(av, b, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_vctr(av, b, range_ijk[0], range_ijk[1]); + for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_vctr(av, b, range_ijk[1], range_ijk[2]); this->mem->barrier(); } @@ -118,11 +113,10 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[1], range_ijk[2], this->dijk[0]); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[2], range_ijk[0], this->dijk[1]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[0], range_ijk[1], this->dijk[2]); + for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_tnsr(av, w, vip_div, 
range_ijk[1], range_ijk[2], this->dijk[0]); this->mem->barrier(); } @@ -134,14 +128,6 @@ namespace libmpdataxx { // off-diagonal components of stress tensor are treated the same as a vector this->mem->barrier(); - for (auto &bc : this->bcs[0]) - { - bc->fill_halos_sgs_vctr(av, bv[0], range_ijkm[1], range_ijk[2]^1, 3); - this->mem->barrier(); - bc->fill_halos_sgs_vctr(av, bv[1], range_ijk[1]^1, range_ijkm[2], 4); - this->mem->barrier(); - } - for (auto &bc : this->bcs[1]) { bc->fill_halos_sgs_vctr(av, bv[0], range_ijk[2]^1, range_ijkm[0], 2); @@ -153,6 +139,13 @@ namespace libmpdataxx bc->fill_halos_sgs_vctr(av, bv[0], range_ijkm[0], range_ijk[1]^1, 2); bc->fill_halos_sgs_vctr(av, bv[1], range_ijk[0]^1, range_ijkm[1], 3); } + + for (auto &bc : this->bcs[0]) + { + bc->fill_halos_sgs_vctr(av, bv[0], range_ijkm[1], range_ijk[2]^1, 3); + this->mem->barrier(); + bc->fill_halos_sgs_vctr(av, bv[1], range_ijk[1]^1, range_ijkm[2], 4); + } this->mem->barrier(); } @@ -183,26 +176,26 @@ namespace libmpdataxx for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk[0]^ext^h, range_ijk_1__ext_1); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); - this->mem->barrier(); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); - this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); + + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); + this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); } else { for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[2]^ext^1, range_ijk[0]^ext^h); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[0]^ext^h, range_ijk_1__ext_1); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); - this->mem->barrier(); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); - this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); + + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); + this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); } this->mem->barrier(); } @@ -215,11 +208,10 @@ namespace libmpdataxx { const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); // const auto range_ijk_1__ext = range_ijk[1]^ext; - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk[0]^ext); for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk[0]^ext, range_ijk_1__ext); + for (auto &bc : this->bcs[0]) bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext); this->mem->barrier(); } @@ -229,11 +221,10 @@ namespace libmpdataxx const int &sign ) final { - 
this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[2], range_ijk[0], sign); for (auto &bc : this->bcs[2]) bc->set_edge_pres(av[2], range_ijk[0], range_ijk[1], sign); + for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); this->mem->barrier(); } @@ -242,11 +233,10 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->save_edge_vel(av[2], range_ijk[0], range_ijk[1]); + for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); this->mem->barrier(); } @@ -255,14 +245,15 @@ namespace libmpdataxx ) final { this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[1], range_ijk[2]); - for (auto &bc : this->bcs[0]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[1], range_ijk[2]); - for (auto &bc : this->bcs[1]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[1]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[0], range_ijk[1]); for (auto &bc : this->bcs[2]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[0], range_ijk[1]); + + for (auto &bc : this->bcs[0]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[1], range_ijk[2]); + this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[1], range_ijk[2]); this->mem->barrier(); } From 5fa48b3b809c7cf680d6f31d6752704cac54b7a9 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Thu, 15 Oct 2020 16:08:46 +0200 Subject: [PATCH 23/44] flag for single-threaded bcond and their special treatment --- libmpdata++/bcond/detail/bcond_common.hpp | 7 ++- libmpdata++/bcond/detail/remote_3d_common.hpp | 2 +- libmpdata++/bcond/detail/remote_common.hpp | 5 ++- libmpdata++/solvers/detail/solver_3d.hpp | 44 ++++++++++++------- 4 files changed, 38 insertions(+), 20 deletions(-) diff --git a/libmpdata++/bcond/detail/bcond_common.hpp b/libmpdata++/bcond/detail/bcond_common.hpp index 1e9c6df2..92cf4445 100644 --- a/libmpdata++/bcond/detail/bcond_common.hpp +++ b/libmpdata++/bcond/detail/bcond_common.hpp @@ -193,6 +193,8 @@ namespace libmpdataxx virtual void avg_edge_and_halo1_sclr_cyclic(arr_3d_t &, const rng_t &, const rng_t &) {}; + const bool single_threaded; + protected: // sclr int @@ -207,7 +209,7 @@ namespace libmpdataxx public: // ctor - bcond_common(const rng_t &i, const std::array &) : + bcond_common(const rng_t &i, const std::array &, bool single_threaded = false) : // sclr left_edge_sclr( i.first() @@ -247,7 +249,8 @@ namespace libmpdataxx rght_intr_vctr( (i^h^(-1)).last() - (halo - 1), (i^h^(-1)).last() - ) + ), + single_threaded(single_threaded) {} // the one for use in shared diff --git a/libmpdata++/bcond/detail/remote_3d_common.hpp b/libmpdata++/bcond/detail/remote_3d_common.hpp index 9db71f65..6aaf0aa3 100644 --- a/libmpdata++/bcond/detail/remote_3d_common.hpp +++ b/libmpdata++/bcond/detail/remote_3d_common.hpp @@ -76,7 +76,7 @@ namespace libmpdataxx const int thread_rank, const int thread_size ) : - parent_t(i, distmem_grid_size), + parent_t(i, 
distmem_grid_size, true), // true indicating that this is a bcond done with a single thread thread_rank(thread_rank), thread_size(thread_size), thread_j(_thread_j), diff --git a/libmpdata++/bcond/detail/remote_common.hpp b/libmpdata++/bcond/detail/remote_common.hpp index c7dfffb9..034b33a5 100644 --- a/libmpdata++/bcond/detail/remote_common.hpp +++ b/libmpdata++/bcond/detail/remote_common.hpp @@ -219,9 +219,10 @@ namespace libmpdataxx // ctor remote_common( const rng_t &i, - const std::array &distmem_grid_size + const std::array &distmem_grid_size, + bool single_threaded = false ) : - parent_t(i, distmem_grid_size) + parent_t(i, distmem_grid_size, single_threaded) { #if defined(USE_MPI) diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index 91c9b21f..e94b1651 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -44,11 +44,11 @@ namespace libmpdataxx ) final // for a given array { const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); - // const auto range_ijk_1__ext = range_ijk[1]^ext; this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk[0]^ext, deriv); for (auto &bc : this->bcs[2]) bc->fill_halos_sclr(arr, range_ijk[0]^ext, range_ijk_1__ext, deriv); - for (auto &bc : this->bcs[0]) bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv); + for (auto &bc : this->bcs[0]) + bc->single_threaded ? bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv) : bc->fill_halos_sclr(arr, range_ijk_1__ext, range_ijk[2]^ext, deriv); // single-threaded bcond (currently only remote_3d) don't like extend_range and need more barriers. this->mem->barrier(); } void xchng(int e) final @@ -143,7 +143,7 @@ namespace libmpdataxx for (auto &bc : this->bcs[0]) { bc->fill_halos_sgs_vctr(av, bv[0], range_ijkm[1], range_ijk[2]^1, 3); - this->mem->barrier(); + if(bc->single_threaded) this->mem->barrier(); bc->fill_halos_sgs_vctr(av, bv[1], range_ijk[1]^1, range_ijkm[2], 4); } this->mem->barrier(); @@ -159,8 +159,6 @@ namespace libmpdataxx this->mem->barrier(); const auto range_ijk_1__ext_h = this->extend_range(range_ijk[1], ext, h); const auto range_ijk_1__ext_1 = this->extend_range(range_ijk[1], ext, 1); - // const auto range_ijk_1__ext_h = range_ijk[1]^h; - // const auto range_ijk_1__ext_1 = range_ijk[1]^1; if (!cyclic) { for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk[2]^ext^1, range_ijk[0]^ext^h); @@ -180,9 +178,16 @@ namespace libmpdataxx for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); + for (auto &bc : this->bcs[0]) + if(bc->single_threaded) + { + bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); + this->mem->barrier(); + } + else bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + + for (auto &bc : this->bcs[0]) + bc->single_threaded ? 
bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h) : bc->fill_halos_vctr_nrml(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); } else { @@ -193,9 +198,15 @@ namespace libmpdataxx for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); + for (auto &bc : this->bcs[0]) + if (bc->single_threaded) + { + bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); + this->mem->barrier(); + } + else bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + for (auto &bc : this->bcs[0]) + bc->single_threaded ? bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h) : bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); } this->mem->barrier(); } @@ -207,11 +218,11 @@ namespace libmpdataxx ) final { const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); - // const auto range_ijk_1__ext = range_ijk[1]^ext; this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk[0]^ext); for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk[0]^ext, range_ijk_1__ext); - for (auto &bc : this->bcs[0]) bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext); + for (auto &bc : this->bcs[0]) + bc->single_threaded ? bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext) : bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); this->mem->barrier(); } @@ -251,8 +262,11 @@ namespace libmpdataxx for (auto &bc : this->bcs[2]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[0], range_ijk[1]); for (auto &bc : this->bcs[2]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[0], range_ijk[1]); - for (auto &bc : this->bcs[0]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[1], range_ijk[2]); - this->mem->barrier(); + for (auto &bc : this->bcs[0]) + { + bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[1], range_ijk[2]); + if(bc->single_threaded) this->mem->barrier(); + } for (auto &bc : this->bcs[0]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[1], range_ijk[2]); this->mem->barrier(); } From 03c97d0ef8a1fab3694dda7a1988ae9677191cfd Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Thu, 15 Oct 2020 17:34:05 +0200 Subject: [PATCH 24/44] make arr3D_storage const --- libmpdata++/blitz.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmpdata++/blitz.hpp b/libmpdata++/blitz.hpp index 0dc0135b..53273bde 100644 --- a/libmpdata++/blitz.hpp +++ b/libmpdata++/blitz.hpp @@ -114,5 +114,5 @@ namespace libmpdataxx }; // 3D blitz++ array storage with the k,i,j order for improved performance of sharedmem decomposition along j - blitz::GeneralArrayStorage<3> arr3D_storage({blitz::thirdDim, blitz::firstDim, blitz::secondDim}, {true, true, true}); + const blitz::GeneralArrayStorage<3> arr3D_storage({blitz::thirdDim, blitz::firstDim, blitz::secondDim}, {true, true, true}); } // namespace libmpdataxx From 8b0aa790ca70860035230f87a725f5ec5133a956 Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Mon, 19 Oct 2020 15:17:12 +0200 Subject: [PATCH 25/44] record_aux_dsc taking into account non-standard storage order in 3D --- libmpdata++/output/hdf5.hpp | 14 ++++++++++---- 1 file changed, 10 
insertions(+), 4 deletions(-) diff --git a/libmpdata++/output/hdf5.hpp b/libmpdata++/output/hdf5.hpp index 553cedef..89c8013c 100644 --- a/libmpdata++/output/hdf5.hpp +++ b/libmpdata++/output/hdf5.hpp @@ -273,8 +273,10 @@ namespace libmpdataxx } case 3: { - typename solver_t::arr_t contiguous_arr(this->mem->grid_size[0], this->mem->grid_size[1], zro); - contiguous_arr = arr(this->mem->grid_size[0], this->mem->grid_size[1], zro); // create a copy that is contiguous + // create a copy that is contiguous and has the C-style (kji) storage order as required by HDF5 + typename solver_t::arr_t contiguous_arr(this->mem->grid_size[0], this->mem->grid_size[1], zro); + contiguous_arr = arr(this->mem->grid_size[0], this->mem->grid_size[1], zro); + dset.write(contiguous_arr.data(), flttype_solver, H5::DataSpace(parent_t::n_dims, srfcshape.data()), space, dxpl_id); break; } @@ -304,7 +306,10 @@ namespace libmpdataxx } case 3: { - typename solver_t::arr_t contiguous_arr = arr(this->mem->grid_size[0], this->mem->grid_size[1], this->mem->grid_size[2]).copy(); // create a copy that is contiguous + // create a copy that is contiguous and has the C-style (kji) storage order as required by HDF5 + typename solver_t::arr_t contiguous_arr(this->mem->grid_size[0], this->mem->grid_size[1], this->mem->grid_size[2]); + contiguous_arr = arr(this->mem->grid_size[0], this->mem->grid_size[1], this->mem->grid_size[2]); + dset.write(contiguous_arr.data(), flttype_solver, H5::DataSpace(parent_t::n_dims, shape.data()), space, dxpl_id); break; } @@ -312,7 +317,7 @@ namespace libmpdataxx }; } - // data is assumed to be contiguous and in the same layout as hdf variable + // data is assumed to be contiguous and in the same layout as hdf variable and in the C-style storage order void record_aux_hlpr(const std::string &name, typename solver_t::real_t *data, H5::H5File hdf) { assert(this->rank == 0); @@ -329,6 +334,7 @@ namespace libmpdataxx aux.write(data, flttype_solver, H5::DataSpace(parent_t::n_dims, shape.data()), space, dxpl_id); } + // data is assumed to be contiguous and in the same layout as hdf variable and in the C-style storage order void record_aux(const std::string &name, typename solver_t::real_t *data) { record_aux_hlpr(name, data, *hdfp); From 9cb5f6a3e0b891704d285055a11f1c7463898ed8 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Mon, 2 Nov 2020 15:40:29 +0100 Subject: [PATCH 26/44] never_delete: deduced storage order from arg --- libmpdata++/concurr/detail/sharedmem.hpp | 4 ++-- .../solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp | 10 ++++------ 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/libmpdata++/concurr/detail/sharedmem.hpp b/libmpdata++/concurr/detail/sharedmem.hpp index 78cdfb46..39ccb7c5 100644 --- a/libmpdata++/concurr/detail/sharedmem.hpp +++ b/libmpdata++/concurr/detail/sharedmem.hpp @@ -107,7 +107,7 @@ namespace libmpdataxx throw std::runtime_error("number of subdomains greater than number of gridpoints"); if (n_dims != 1) - sumtmp.reset(new blitz::Array(this->grid_size[n_dims-2])); + sumtmp.reset(new blitz::Array(this->grid_size[shmem_decomp_dim])); xtmtmp.reset(new blitz::Array(size)); } @@ -585,7 +585,7 @@ namespace libmpdataxx virtual arr_t *never_delete(arr_t *arg) override { - arr_t *ret = new arr_t(arg->dataFirst(), arg->shape(), blitz::neverDeleteData, arr3D_storage); + arr_t *ret = new arr_t(arg->dataFirst(), arg->shape(), blitz::neverDeleteData, blitz::GeneralArrayStorage<3>(arg->ordering(), {true, true, true})); ret->reindexSelf(arg->base()); return ret; } diff --git 
a/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp b/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp index 4e92ad49..07d15f50 100644 --- a/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp +++ b/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp @@ -203,12 +203,10 @@ namespace libmpdataxx ijk_vec[d] = rng_t(this->ijk[d].first(), this->ijk[d].last()); } - if(ct_params_t::n_dims < 3) // 1D and 2D - sharedmem in x direction - { - if (this->rank == 0) - ijk_vec[0] = rng_t(this->ijk[0].first() - 1, this->ijk[0].last()); - } - else // 3D - sharedmem in y direction + if(ct_params_t::n_dims < 3 && this->rank == 0 // 1D and 2D - sharedmem in x direction + || + ct_params_t::n_dims == 3 // 3D - sharedmem in y direction + ) ijk_vec[0] = rng_t(this->ijk[0].first() - 1, this->ijk[0].last()); ijkm_sep = ijkm; From 9f719f81f79c6121640387e5ab1abd8fa7ec60a9 Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Thu, 5 Nov 2020 13:18:57 +0100 Subject: [PATCH 27/44] working solver_3d remote: bcond0 first --- libmpdata++/solvers/detail/solver_3d.hpp | 80 ++++++++++++++---------- 1 file changed, 48 insertions(+), 32 deletions(-) diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index e94b1651..071f6854 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -45,11 +45,12 @@ namespace libmpdataxx { const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); this->mem->barrier(); - for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk[0]^ext, deriv); - for (auto &bc : this->bcs[2]) bc->fill_halos_sclr(arr, range_ijk[0]^ext, range_ijk_1__ext, deriv); for (auto &bc : this->bcs[0]) bc->single_threaded ? bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv) : bc->fill_halos_sclr(arr, range_ijk_1__ext, range_ijk[2]^ext, deriv); // single-threaded bcond (currently only remote_3d) don't like extend_range and need more barriers. 
this->mem->barrier(); + for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk[0]^ext, deriv); + for (auto &bc : this->bcs[2]) bc->fill_halos_sclr(arr, range_ijk[0]^ext, range_ijk_1__ext, deriv); + this->mem->barrier(); } void xchng(int e) final { @@ -61,25 +62,28 @@ namespace libmpdataxx this->mem->barrier(); if (!cyclic) { + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng(arrvec, i, j, ad); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); } else { + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng_cyclic(arrvec, j, k, ad); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng_cyclic(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng_cyclic(arrvec, i, j, ad); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng_cyclic(arrvec, j, k, ad); } this->mem->barrier(); } virtual void xchng_flux(arrvec_t &arrvec) final { + this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->fill_halos_flux(arrvec, j, k); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_flux(arrvec, k, i); for (auto &bc : this->bcs[2]) bc->fill_halos_flux(arrvec, i, j); - for (auto &bc : this->bcs[0]) bc->fill_halos_flux(arrvec, j, k); this->mem->barrier(); } @@ -100,10 +104,11 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { + this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_vctr(av, b, range_ijk[1], range_ijk[2]); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_vctr(av, b, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_vctr(av, b, range_ijk[0], range_ijk[1]); - for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_vctr(av, b, range_ijk[1], range_ijk[2]); this->mem->barrier(); } @@ -113,10 +118,11 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { + this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[1], range_ijk[2], this->dijk[0]); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[2], range_ijk[0], this->dijk[1]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[0], range_ijk[1], this->dijk[2]); - for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[1], range_ijk[2], this->dijk[0]); this->mem->barrier(); } @@ -128,6 +134,14 @@ namespace libmpdataxx { // off-diagonal components of stress tensor are treated the same as a vector this->mem->barrier(); + for (auto &bc : this->bcs[0]) + { + bc->fill_halos_sgs_vctr(av, bv[0], range_ijkm[1], range_ijk[2]^1, 3); + if(bc->single_threaded) this->mem->barrier(); + bc->fill_halos_sgs_vctr(av, bv[1], range_ijk[1]^1, range_ijkm[2], 4); + } + this->mem->barrier(); + for (auto &bc : this->bcs[1]) { bc->fill_halos_sgs_vctr(av, bv[0], range_ijk[2]^1, range_ijkm[0], 2); @@ -139,13 +153,6 @@ namespace libmpdataxx bc->fill_halos_sgs_vctr(av, bv[0], range_ijkm[0], range_ijk[1]^1, 2); bc->fill_halos_sgs_vctr(av, bv[1], range_ijk[0]^1, range_ijkm[1], 3); } - - for (auto &bc : this->bcs[0]) - { - bc->fill_halos_sgs_vctr(av, bv[0], range_ijkm[1], range_ijk[2]^1, 3); - if(bc->single_threaded) this->mem->barrier(); - bc->fill_halos_sgs_vctr(av, bv[1], range_ijk[1]^1, range_ijkm[2], 4); - } this->mem->barrier(); } @@ -174,10 +181,6 @@ namespace 
libmpdataxx for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk[0]^ext^h, range_ijk_1__ext_1); - for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); - - for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); - for (auto &bc : this->bcs[0]) if(bc->single_threaded) { @@ -185,19 +188,21 @@ namespace libmpdataxx this->mem->barrier(); } else bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + this->mem->barrier(); + for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); for (auto &bc : this->bcs[0]) bc->single_threaded ? bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h) : bc->fill_halos_vctr_nrml(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + this->mem->barrier(); + for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); + + } else { for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[2]^ext^1, range_ijk[0]^ext^h); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[0]^ext^h, range_ijk_1__ext_1); - for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); - - for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); - for (auto &bc : this->bcs[0]) if (bc->single_threaded) { @@ -205,8 +210,14 @@ namespace libmpdataxx this->mem->barrier(); } else bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + this->mem->barrier(); + for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); + for (auto &bc : this->bcs[0]) bc->single_threaded ? bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h) : bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + this->mem->barrier(); + for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); + } this->mem->barrier(); } @@ -219,11 +230,12 @@ namespace libmpdataxx { const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); this->mem->barrier(); - for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk[0]^ext); - for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk[0]^ext, range_ijk_1__ext); for (auto &bc : this->bcs[0]) bc->single_threaded ? 
bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext) : bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); this->mem->barrier(); + for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk[0]^ext); + for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk[0]^ext, range_ijk_1__ext); + this->mem->barrier(); } virtual void set_edges( @@ -232,10 +244,11 @@ namespace libmpdataxx const int &sign ) final { + this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[2], range_ijk[0], sign); for (auto &bc : this->bcs[2]) bc->set_edge_pres(av[2], range_ijk[0], range_ijk[1], sign); - for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); this->mem->barrier(); } @@ -244,10 +257,11 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { + this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->save_edge_vel(av[2], range_ijk[0], range_ijk[1]); - for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); this->mem->barrier(); } @@ -256,19 +270,21 @@ namespace libmpdataxx ) final { this->mem->barrier(); - for (auto &bc : this->bcs[1]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[2], range_ijk[0]); - for (auto &bc : this->bcs[1]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[2], range_ijk[0]); - - for (auto &bc : this->bcs[2]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[0], range_ijk[1]); - for (auto &bc : this->bcs[2]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[0], range_ijk[1]); - for (auto &bc : this->bcs[0]) { bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[1], range_ijk[2]); if(bc->single_threaded) this->mem->barrier(); } + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[1], range_ijk[2]); this->mem->barrier(); + for (auto &bc : this->bcs[1]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[2], range_ijk[0]); + for (auto &bc : this->bcs[1]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[2], range_ijk[0]); + + for (auto &bc : this->bcs[2]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[0], range_ijk[1]); + for (auto &bc : this->bcs[2]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[0], range_ijk[1]); + + this->mem->barrier(); } void hook_ante_loop(const typename parent_t::advance_arg_t nt) // TODO: this nt conflicts in fact with multiple-advance()-call logic! From 265938867ebf337b2f96bf1a541618f29d9a415a Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Thu, 5 Nov 2020 16:42:57 +0100 Subject: [PATCH 28/44] 3d fill halos: do bcond0 last to avoid barriers, but only if transverse ranges are within ijk, no extension --- libmpdata++/solvers/detail/solver_3d.hpp | 81 ++++++++++++++---------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index 071f6854..d9732388 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -46,8 +46,13 @@ namespace libmpdataxx const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); this->mem->barrier(); for (auto &bc : this->bcs[0]) - bc->single_threaded ? 
bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv) : bc->fill_halos_sclr(arr, range_ijk_1__ext, range_ijk[2]^ext, deriv); // single-threaded bcond (currently only remote_3d) don't like extend_range and need more barriers. - this->mem->barrier(); + if(bc->single_threaded) // single-threaded bcond (currently only remote_3d) don't like extend_range and need more barriers. + { + bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv); + this->mem->barrier(); + } + else + bc->fill_halos_sclr(arr, range_ijk_1__ext, range_ijk[2]^ext, deriv); for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk[0]^ext, deriv); for (auto &bc : this->bcs[2]) bc->fill_halos_sclr(arr, range_ijk[0]^ext, range_ijk_1__ext, deriv); this->mem->barrier(); @@ -62,28 +67,25 @@ namespace libmpdataxx this->mem->barrier(); if (!cyclic) { - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); - this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng(arrvec, i, j, ad); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); } else { - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng_cyclic(arrvec, j, k, ad); - this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng_cyclic(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng_cyclic(arrvec, i, j, ad); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng_cyclic(arrvec, j, k, ad); } this->mem->barrier(); } virtual void xchng_flux(arrvec_t &arrvec) final { - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_flux(arrvec, j, k); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_flux(arrvec, k, i); for (auto &bc : this->bcs[2]) bc->fill_halos_flux(arrvec, i, j); + for (auto &bc : this->bcs[0]) bc->fill_halos_flux(arrvec, j, k); this->mem->barrier(); } @@ -104,11 +106,10 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_vctr(av, b, range_ijk[1], range_ijk[2]); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_vctr(av, b, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_vctr(av, b, range_ijk[0], range_ijk[1]); + for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_vctr(av, b, range_ijk[1], range_ijk[2]); this->mem->barrier(); } @@ -118,11 +119,10 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[1], range_ijk[2], this->dijk[0]); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[2], range_ijk[0], this->dijk[1]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[0], range_ijk[1], this->dijk[2]); + for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[1], range_ijk[2], this->dijk[0]); this->mem->barrier(); } @@ -139,8 +139,8 @@ namespace libmpdataxx bc->fill_halos_sgs_vctr(av, bv[0], range_ijkm[1], range_ijk[2]^1, 3); if(bc->single_threaded) this->mem->barrier(); bc->fill_halos_sgs_vctr(av, bv[1], range_ijk[1]^1, range_ijkm[2], 4); + if(bc->single_threaded) this->mem->barrier(); } - this->mem->barrier(); for (auto &bc : this->bcs[1]) { @@ -187,13 +187,18 @@ namespace libmpdataxx bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); 
this->mem->barrier(); } - else bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); - this->mem->barrier(); + else + bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); for (auto &bc : this->bcs[0]) - bc->single_threaded ? bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h) : bc->fill_halos_vctr_nrml(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); - this->mem->barrier(); + if(bc->single_threaded) + { + bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); + this->mem->barrier(); + } + else + bc->fill_halos_vctr_nrml(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); @@ -209,15 +214,19 @@ namespace libmpdataxx bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); this->mem->barrier(); } - else bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); - this->mem->barrier(); + else + bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); for (auto &bc : this->bcs[0]) - bc->single_threaded ? bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h) : bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); - this->mem->barrier(); + if(bc->single_threaded) + { + bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); + this->mem->barrier(); + } + else + bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); - } this->mem->barrier(); } @@ -231,8 +240,13 @@ namespace libmpdataxx const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); this->mem->barrier(); for (auto &bc : this->bcs[0]) - bc->single_threaded ? 
bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext) : bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); - this->mem->barrier(); + if(bc->single_threaded) + { + bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext); + this->mem->barrier(); + } + else + bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk[0]^ext); for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk[0]^ext, range_ijk_1__ext); this->mem->barrier(); @@ -244,11 +258,10 @@ namespace libmpdataxx const int &sign ) final { - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[2], range_ijk[0], sign); for (auto &bc : this->bcs[2]) bc->set_edge_pres(av[2], range_ijk[0], range_ijk[1], sign); + for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); this->mem->barrier(); } @@ -257,11 +270,10 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->save_edge_vel(av[2], range_ijk[0], range_ijk[1]); + for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); this->mem->barrier(); } @@ -270,19 +282,18 @@ namespace libmpdataxx ) final { this->mem->barrier(); + for (auto &bc : this->bcs[1]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[2], range_ijk[0]); + for (auto &bc : this->bcs[1]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[2], range_ijk[0]); + + for (auto &bc : this->bcs[2]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[0], range_ijk[1]); + for (auto &bc : this->bcs[2]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[0], range_ijk[1]); + for (auto &bc : this->bcs[0]) { bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[1], range_ijk[2]); if(bc->single_threaded) this->mem->barrier(); } - this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[1], range_ijk[2]); - this->mem->barrier(); - for (auto &bc : this->bcs[1]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[2], range_ijk[0]); - for (auto &bc : this->bcs[1]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[2], range_ijk[0]); - - for (auto &bc : this->bcs[2]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[0], range_ijk[1]); - for (auto &bc : this->bcs[2]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[0], range_ijk[1]); this->mem->barrier(); } From daad63f1c90af9854e87612e65bd03b6b9cc5bfa Mon Sep 17 00:00:00 2001 From: pdziekan Date: Fri, 6 Nov 2020 10:54:05 +0100 Subject: [PATCH 29/44] Revert "3d fill halos: do bcond0 last to avoid barriers, but only if transverse ranges are within ijk, no extension" This reverts commit 265938867ebf337b2f96bf1a541618f29d9a415a. 
--- libmpdata++/solvers/detail/solver_3d.hpp | 81 ++++++++++-------------- 1 file changed, 35 insertions(+), 46 deletions(-) diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index d9732388..071f6854 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -46,13 +46,8 @@ namespace libmpdataxx const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); this->mem->barrier(); for (auto &bc : this->bcs[0]) - if(bc->single_threaded) // single-threaded bcond (currently only remote_3d) don't like extend_range and need more barriers. - { - bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv); - this->mem->barrier(); - } - else - bc->fill_halos_sclr(arr, range_ijk_1__ext, range_ijk[2]^ext, deriv); + bc->single_threaded ? bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv) : bc->fill_halos_sclr(arr, range_ijk_1__ext, range_ijk[2]^ext, deriv); // single-threaded bcond (currently only remote_3d) don't like extend_range and need more barriers. + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk[0]^ext, deriv); for (auto &bc : this->bcs[2]) bc->fill_halos_sclr(arr, range_ijk[0]^ext, range_ijk_1__ext, deriv); this->mem->barrier(); @@ -67,25 +62,28 @@ namespace libmpdataxx this->mem->barrier(); if (!cyclic) { + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng(arrvec, i, j, ad); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); } else { + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng_cyclic(arrvec, j, k, ad); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng_cyclic(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng_cyclic(arrvec, i, j, ad); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng_cyclic(arrvec, j, k, ad); } this->mem->barrier(); } virtual void xchng_flux(arrvec_t &arrvec) final { + this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->fill_halos_flux(arrvec, j, k); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_flux(arrvec, k, i); for (auto &bc : this->bcs[2]) bc->fill_halos_flux(arrvec, i, j); - for (auto &bc : this->bcs[0]) bc->fill_halos_flux(arrvec, j, k); this->mem->barrier(); } @@ -106,10 +104,11 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { + this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_vctr(av, b, range_ijk[1], range_ijk[2]); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_vctr(av, b, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_vctr(av, b, range_ijk[0], range_ijk[1]); - for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_vctr(av, b, range_ijk[1], range_ijk[2]); this->mem->barrier(); } @@ -119,10 +118,11 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { + this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[1], range_ijk[2], this->dijk[0]); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[2], range_ijk[0], this->dijk[1]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[0], range_ijk[1], this->dijk[2]); - for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_tnsr(av, w, vip_div, 
range_ijk[1], range_ijk[2], this->dijk[0]); this->mem->barrier(); } @@ -139,8 +139,8 @@ namespace libmpdataxx bc->fill_halos_sgs_vctr(av, bv[0], range_ijkm[1], range_ijk[2]^1, 3); if(bc->single_threaded) this->mem->barrier(); bc->fill_halos_sgs_vctr(av, bv[1], range_ijk[1]^1, range_ijkm[2], 4); - if(bc->single_threaded) this->mem->barrier(); } + this->mem->barrier(); for (auto &bc : this->bcs[1]) { @@ -187,18 +187,13 @@ namespace libmpdataxx bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); this->mem->barrier(); } - else - bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + else bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + this->mem->barrier(); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); for (auto &bc : this->bcs[0]) - if(bc->single_threaded) - { - bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); - this->mem->barrier(); - } - else - bc->fill_halos_vctr_nrml(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + bc->single_threaded ? bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h) : bc->fill_halos_vctr_nrml(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); @@ -214,19 +209,15 @@ namespace libmpdataxx bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); this->mem->barrier(); } - else - bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + else bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + this->mem->barrier(); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); for (auto &bc : this->bcs[0]) - if(bc->single_threaded) - { - bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); - this->mem->barrier(); - } - else - bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + bc->single_threaded ? bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h) : bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); + } this->mem->barrier(); } @@ -240,13 +231,8 @@ namespace libmpdataxx const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); this->mem->barrier(); for (auto &bc : this->bcs[0]) - if(bc->single_threaded) - { - bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext); - this->mem->barrier(); - } - else - bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); + bc->single_threaded ? 
bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext) : bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); + this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk[0]^ext); for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk[0]^ext, range_ijk_1__ext); this->mem->barrier(); @@ -258,10 +244,11 @@ namespace libmpdataxx const int &sign ) final { + this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[2], range_ijk[0], sign); for (auto &bc : this->bcs[2]) bc->set_edge_pres(av[2], range_ijk[0], range_ijk[1], sign); - for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); this->mem->barrier(); } @@ -270,10 +257,11 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { + this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->save_edge_vel(av[2], range_ijk[0], range_ijk[1]); - for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); this->mem->barrier(); } @@ -282,18 +270,19 @@ namespace libmpdataxx ) final { this->mem->barrier(); - for (auto &bc : this->bcs[1]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[2], range_ijk[0]); - for (auto &bc : this->bcs[1]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[2], range_ijk[0]); - - for (auto &bc : this->bcs[2]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[0], range_ijk[1]); - for (auto &bc : this->bcs[2]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[0], range_ijk[1]); - for (auto &bc : this->bcs[0]) { bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[1], range_ijk[2]); if(bc->single_threaded) this->mem->barrier(); } + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[1], range_ijk[2]); + this->mem->barrier(); + for (auto &bc : this->bcs[1]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[2], range_ijk[0]); + for (auto &bc : this->bcs[1]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[2], range_ijk[0]); + + for (auto &bc : this->bcs[2]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[0], range_ijk[1]); + for (auto &bc : this->bcs[2]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[0], range_ijk[1]); this->mem->barrier(); } From 9b50d06924de42156cafcaf079df14daba6a9b32 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Fri, 6 Nov 2020 11:59:18 +0100 Subject: [PATCH 30/44] Revert "Revert "3d fill halos: do bcond0 last to avoid barriers, but only if transverse ranges are within ijk, no extension"" This reverts commit daad63f1c90af9854e87612e65bd03b6b9cc5bfa. --- libmpdata++/solvers/detail/solver_3d.hpp | 81 ++++++++++++++---------- 1 file changed, 46 insertions(+), 35 deletions(-) diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index 071f6854..d9732388 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -46,8 +46,13 @@ namespace libmpdataxx const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); this->mem->barrier(); for (auto &bc : this->bcs[0]) - bc->single_threaded ? 
bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv) : bc->fill_halos_sclr(arr, range_ijk_1__ext, range_ijk[2]^ext, deriv); // single-threaded bcond (currently only remote_3d) don't like extend_range and need more barriers. - this->mem->barrier(); + if(bc->single_threaded) // single-threaded bcond (currently only remote_3d) don't like extend_range and need more barriers. + { + bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv); + this->mem->barrier(); + } + else + bc->fill_halos_sclr(arr, range_ijk_1__ext, range_ijk[2]^ext, deriv); for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk[0]^ext, deriv); for (auto &bc : this->bcs[2]) bc->fill_halos_sclr(arr, range_ijk[0]^ext, range_ijk_1__ext, deriv); this->mem->barrier(); @@ -62,28 +67,25 @@ namespace libmpdataxx this->mem->barrier(); if (!cyclic) { - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); - this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng(arrvec, i, j, ad); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); } else { - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng_cyclic(arrvec, j, k, ad); - this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng_cyclic(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng_cyclic(arrvec, i, j, ad); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng_cyclic(arrvec, j, k, ad); } this->mem->barrier(); } virtual void xchng_flux(arrvec_t &arrvec) final { - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_flux(arrvec, j, k); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_flux(arrvec, k, i); for (auto &bc : this->bcs[2]) bc->fill_halos_flux(arrvec, i, j); + for (auto &bc : this->bcs[0]) bc->fill_halos_flux(arrvec, j, k); this->mem->barrier(); } @@ -104,11 +106,10 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_vctr(av, b, range_ijk[1], range_ijk[2]); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_vctr(av, b, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_vctr(av, b, range_ijk[0], range_ijk[1]); + for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_vctr(av, b, range_ijk[1], range_ijk[2]); this->mem->barrier(); } @@ -118,11 +119,10 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[1], range_ijk[2], this->dijk[0]); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[2], range_ijk[0], this->dijk[1]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[0], range_ijk[1], this->dijk[2]); + for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[1], range_ijk[2], this->dijk[0]); this->mem->barrier(); } @@ -139,8 +139,8 @@ namespace libmpdataxx bc->fill_halos_sgs_vctr(av, bv[0], range_ijkm[1], range_ijk[2]^1, 3); if(bc->single_threaded) this->mem->barrier(); bc->fill_halos_sgs_vctr(av, bv[1], range_ijk[1]^1, range_ijkm[2], 4); + if(bc->single_threaded) this->mem->barrier(); } - this->mem->barrier(); for (auto &bc : this->bcs[1]) { @@ -187,13 +187,18 @@ namespace libmpdataxx bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); 
this->mem->barrier(); } - else bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); - this->mem->barrier(); + else + bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); for (auto &bc : this->bcs[0]) - bc->single_threaded ? bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h) : bc->fill_halos_vctr_nrml(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); - this->mem->barrier(); + if(bc->single_threaded) + { + bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); + this->mem->barrier(); + } + else + bc->fill_halos_vctr_nrml(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); @@ -209,15 +214,19 @@ namespace libmpdataxx bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); this->mem->barrier(); } - else bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); - this->mem->barrier(); + else + bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); for (auto &bc : this->bcs[0]) - bc->single_threaded ? bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h) : bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); - this->mem->barrier(); + if(bc->single_threaded) + { + bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); + this->mem->barrier(); + } + else + bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); - } this->mem->barrier(); } @@ -231,8 +240,13 @@ namespace libmpdataxx const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); this->mem->barrier(); for (auto &bc : this->bcs[0]) - bc->single_threaded ? 
bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext) : bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); - this->mem->barrier(); + if(bc->single_threaded) + { + bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext); + this->mem->barrier(); + } + else + bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk[0]^ext); for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk[0]^ext, range_ijk_1__ext); this->mem->barrier(); @@ -244,11 +258,10 @@ namespace libmpdataxx const int &sign ) final { - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[2], range_ijk[0], sign); for (auto &bc : this->bcs[2]) bc->set_edge_pres(av[2], range_ijk[0], range_ijk[1], sign); + for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); this->mem->barrier(); } @@ -257,11 +270,10 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { - this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); this->mem->barrier(); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->save_edge_vel(av[2], range_ijk[0], range_ijk[1]); + for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); this->mem->barrier(); } @@ -270,19 +282,18 @@ namespace libmpdataxx ) final { this->mem->barrier(); + for (auto &bc : this->bcs[1]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[2], range_ijk[0]); + for (auto &bc : this->bcs[1]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[2], range_ijk[0]); + + for (auto &bc : this->bcs[2]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[0], range_ijk[1]); + for (auto &bc : this->bcs[2]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[0], range_ijk[1]); + for (auto &bc : this->bcs[0]) { bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[1], range_ijk[2]); if(bc->single_threaded) this->mem->barrier(); } - this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[1], range_ijk[2]); - this->mem->barrier(); - for (auto &bc : this->bcs[1]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[2], range_ijk[0]); - for (auto &bc : this->bcs[1]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[2], range_ijk[0]); - - for (auto &bc : this->bcs[2]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[0], range_ijk[1]); - for (auto &bc : this->bcs[2]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[0], range_ijk[1]); this->mem->barrier(); } From 6e1e05dc1d244260b550c728ed7c797ca3d0e39b Mon Sep 17 00:00:00 2001 From: pdziekan Date: Fri, 6 Nov 2020 12:43:28 +0100 Subject: [PATCH 31/44] solver 3d fill halos: add barrier before single-threaded bcond[0] if range is extended, because we need to make sure that other threads finished filling other extended bconds --- libmpdata++/solvers/detail/solver_3d.hpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index d9732388..c9ac00cc 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -97,6 +97,7 @@ namespace libmpdataxx this->mem->barrier(); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_div(arr, range_ijk[0], range_ijk[1]); for 
(auto &bc : this->bcs[1]) bc->fill_halos_sgs_div(arr, range_ijk[2]^h, range_ijk[0]); + if(bc->single_threaded) this->mem->barrier(); // to make sure that all threads have filled range_ijk[2]^h before it is used to fill along x for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_div(arr, range_ijk[1], range_ijk[2]^h); this->mem->barrier(); } @@ -184,6 +185,7 @@ namespace libmpdataxx for (auto &bc : this->bcs[0]) if(bc->single_threaded) { + this->mem->barrier(); bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); this->mem->barrier(); } @@ -194,14 +196,13 @@ namespace libmpdataxx for (auto &bc : this->bcs[0]) if(bc->single_threaded) { + this->mem->barrier(); bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); this->mem->barrier(); } else bc->fill_halos_vctr_nrml(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); - - } else { @@ -211,6 +212,7 @@ namespace libmpdataxx for (auto &bc : this->bcs[0]) if (bc->single_threaded) { + this->mem->barrier(); bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); this->mem->barrier(); } @@ -221,6 +223,7 @@ namespace libmpdataxx for (auto &bc : this->bcs[0]) if(bc->single_threaded) { + this->mem->barrier(); bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); this->mem->barrier(); } From 004afab5737fdd17d3e3cb5f2d31d785e61048e9 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Fri, 6 Nov 2020 13:24:00 +0100 Subject: [PATCH 32/44] solver 3d fill halos: barriers after both left and right remote bconds done, not between --- libmpdata++/solvers/detail/solver_3d.hpp | 90 ++++++++---------------- 1 file changed, 30 insertions(+), 60 deletions(-) diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index c9ac00cc..108329d3 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -26,6 +26,11 @@ namespace libmpdataxx { using parent_t = solver_common; + bool barrier_if_single_threaded_bc0() + { + return this->bcs[0].at(0)->single_threaded || this->bcs[0].at(1)->single_threaded; + } + public: using real_t = typename ct_params_t::real_t; @@ -45,14 +50,8 @@ namespace libmpdataxx { const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); this->mem->barrier(); - for (auto &bc : this->bcs[0]) - if(bc->single_threaded) // single-threaded bcond (currently only remote_3d) don't like extend_range and need more barriers. - { - bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv); - this->mem->barrier(); - } - else - bc->fill_halos_sclr(arr, range_ijk_1__ext, range_ijk[2]^ext, deriv); + for (auto &bc : this->bcs[0]) bc->single_threaded ? 
bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv) : bc->fill_halos_sclr(arr, range_ijk_1__ext, range_ijk[2]^ext, deriv); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk[0]^ext, deriv); for (auto &bc : this->bcs[2]) bc->fill_halos_sclr(arr, range_ijk[0]^ext, range_ijk_1__ext, deriv); this->mem->barrier(); @@ -97,7 +96,7 @@ namespace libmpdataxx this->mem->barrier(); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_div(arr, range_ijk[0], range_ijk[1]); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_div(arr, range_ijk[2]^h, range_ijk[0]); - if(bc->single_threaded) this->mem->barrier(); // to make sure that all threads have filled range_ijk[2]^h before it is used to fill along x + barrier_if_single_threaded_bc0(); // to make sure that all threads have filled range_ijk[2]^h before it is used to fill along x for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_div(arr, range_ijk[1], range_ijk[2]^h); this->mem->barrier(); } @@ -138,10 +137,9 @@ namespace libmpdataxx for (auto &bc : this->bcs[0]) { bc->fill_halos_sgs_vctr(av, bv[0], range_ijkm[1], range_ijk[2]^1, 3); - if(bc->single_threaded) this->mem->barrier(); bc->fill_halos_sgs_vctr(av, bv[1], range_ijk[1]^1, range_ijkm[2], 4); - if(bc->single_threaded) this->mem->barrier(); } + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) { @@ -182,26 +180,16 @@ namespace libmpdataxx for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk[0]^ext^h, range_ijk_1__ext_1); - for (auto &bc : this->bcs[0]) - if(bc->single_threaded) - { - this->mem->barrier(); - bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); - this->mem->barrier(); - } - else - bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[0]) bc->single_threaded ? bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1) : bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); - for (auto &bc : this->bcs[0]) - if(bc->single_threaded) - { - this->mem->barrier(); - bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); - this->mem->barrier(); - } - else - bc->fill_halos_vctr_nrml(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[0]) bc->single_threaded ? bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h) : bc->fill_halos_vctr_nrml(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); } else @@ -209,26 +197,16 @@ namespace libmpdataxx for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[2]^ext^1, range_ijk[0]^ext^h); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[0]^ext^h, range_ijk_1__ext_1); - for (auto &bc : this->bcs[0]) - if (bc->single_threaded) - { - this->mem->barrier(); - bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); - this->mem->barrier(); - } - else - bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[0]) bc->single_threaded ? 
bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1) : bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); - for (auto &bc : this->bcs[0]) - if(bc->single_threaded) - { - this->mem->barrier(); - bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); - this->mem->barrier(); - } - else - bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[0]) bc->single_threaded ? bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h) : bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); } this->mem->barrier(); @@ -242,14 +220,9 @@ namespace libmpdataxx { const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); this->mem->barrier(); - for (auto &bc : this->bcs[0]) - if(bc->single_threaded) - { - bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext); - this->mem->barrier(); - } - else - bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); + for (auto &bc : this->bcs[0]) bc->single_threaded ? bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext) : bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk[0]^ext); for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk[0]^ext, range_ijk_1__ext); this->mem->barrier(); @@ -291,11 +264,8 @@ namespace libmpdataxx for (auto &bc : this->bcs[2]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[0], range_ijk[1]); for (auto &bc : this->bcs[2]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[0], range_ijk[1]); - for (auto &bc : this->bcs[0]) - { - bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[1], range_ijk[2]); - if(bc->single_threaded) this->mem->barrier(); - } + for (auto &bc : this->bcs[0]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[1], range_ijk[2]); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[0]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[1], range_ijk[2]); this->mem->barrier(); From 9156171fd091b6ce08bf948faf4f88a5347b9d26 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Mon, 9 Nov 2020 11:28:28 +0100 Subject: [PATCH 33/44] solver3d: fill halos order as in master, barrier after bc[0] if single threaded --- libmpdata++/solvers/detail/solver_3d.hpp | 34 ++++++++++++++---------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index 108329d3..9b036767 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -66,15 +66,17 @@ namespace libmpdataxx this->mem->barrier(); if (!cyclic) { + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng(arrvec, i, j, ad); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); } else { + for (auto &bc : this->bcs[0]) 
bc->fill_halos_vctr_alng_cyclic(arrvec, j, k, ad); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng_cyclic(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng_cyclic(arrvec, i, j, ad); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng_cyclic(arrvec, j, k, ad); } this->mem->barrier(); } @@ -82,9 +84,10 @@ namespace libmpdataxx virtual void xchng_flux(arrvec_t &arrvec) final { this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->fill_halos_flux(arrvec, j, k); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->fill_halos_flux(arrvec, k, i); for (auto &bc : this->bcs[2]) bc->fill_halos_flux(arrvec, i, j); - for (auto &bc : this->bcs[0]) bc->fill_halos_flux(arrvec, j, k); this->mem->barrier(); } @@ -107,9 +110,10 @@ namespace libmpdataxx ) final { this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_vctr(av, b, range_ijk[1], range_ijk[2]); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_vctr(av, b, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_vctr(av, b, range_ijk[0], range_ijk[1]); - for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_vctr(av, b, range_ijk[1], range_ijk[2]); this->mem->barrier(); } @@ -120,9 +124,10 @@ namespace libmpdataxx ) final { this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[1], range_ijk[2], this->dijk[0]); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[2], range_ijk[0], this->dijk[1]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[0], range_ijk[1], this->dijk[2]); - for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[1], range_ijk[2], this->dijk[0]); this->mem->barrier(); } @@ -222,7 +227,6 @@ namespace libmpdataxx this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->single_threaded ? 
bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext) : bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); barrier_if_single_threaded_bc0(); - for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk[0]^ext); for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk[0]^ext, range_ijk_1__ext); this->mem->barrier(); @@ -234,10 +238,10 @@ namespace libmpdataxx const int &sign ) final { - this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[2], range_ijk[0], sign); for (auto &bc : this->bcs[2]) bc->set_edge_pres(av[2], range_ijk[0], range_ijk[1], sign); - for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); this->mem->barrier(); } @@ -246,10 +250,10 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { - this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->save_edge_vel(av[2], range_ijk[0], range_ijk[1]); - for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); this->mem->barrier(); } @@ -258,16 +262,18 @@ namespace libmpdataxx ) final { this->mem->barrier(); + + for (auto &bc : this->bcs[0]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[1], range_ijk[2]); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[0]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[1], range_ijk[2]); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[1]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[1]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[0], range_ijk[1]); for (auto &bc : this->bcs[2]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[0], range_ijk[1]); - for (auto &bc : this->bcs[0]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[1], range_ijk[2]); - barrier_if_single_threaded_bc0(); - for (auto &bc : this->bcs[0]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[1], range_ijk[2]); - this->mem->barrier(); } From e19cdcdf49c3b1f08a6a4b6fa40eec01dc2f8736 Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Mon, 9 Nov 2020 12:29:18 +0100 Subject: [PATCH 34/44] solver3d: fix barrier_if_single_threaded --- libmpdata++/solvers/detail/solver_3d.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index 9b036767..e96f9455 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -26,9 +26,9 @@ namespace libmpdataxx { using parent_t = solver_common; - bool barrier_if_single_threaded_bc0() + void barrier_if_single_threaded_bc0() { - return this->bcs[0].at(0)->single_threaded || this->bcs[0].at(1)->single_threaded; + if(this->bcs[0].at(0)->single_threaded || this->bcs[0].at(1)->single_threaded) this->mem->barrier(); } public: From daa8159465a32e44cae49fc29d9579a888a666cf Mon Sep 17 00:00:00 2001 From: pdziekan Date: Mon, 9 Nov 2020 19:02:44 +0100 Subject: [PATCH 35/44] solver3d: add barrier at start of set_edge_pres and save_edge_vel --- libmpdata++/solvers/detail/solver_3d.hpp | 2 ++ 1 file changed, 
2 insertions(+) diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index e96f9455..8723b200 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -238,6 +238,7 @@ namespace libmpdataxx const int &sign ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[2], range_ijk[0], sign); @@ -250,6 +251,7 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[2], range_ijk[0]); From 2b9a9fc0883fb2b6060e52911a6a9720cffd1811 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Tue, 10 Nov 2020 10:02:37 +0100 Subject: [PATCH 36/44] Revert "solver3d: add barrier at start of set_edge_pres and save_edge_vel" This reverts commit daa8159465a32e44cae49fc29d9579a888a666cf. --- libmpdata++/solvers/detail/solver_3d.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index 8723b200..e96f9455 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -238,7 +238,6 @@ namespace libmpdataxx const int &sign ) final { - this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[2], range_ijk[0], sign); @@ -251,7 +250,6 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { - this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[2], range_ijk[0]); From 2a57b9f8a69c92c7cd2ae50c1b6e0167badc2437 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Tue, 10 Nov 2020 10:03:13 +0100 Subject: [PATCH 37/44] s2d: no barrier pre avg save edge --- libmpdata++/solvers/detail/solver_2d.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/libmpdata++/solvers/detail/solver_2d.hpp b/libmpdata++/solvers/detail/solver_2d.hpp index c556e5cc..2a95b563 100644 --- a/libmpdata++/solvers/detail/solver_2d.hpp +++ b/libmpdata++/solvers/detail/solver_2d.hpp @@ -170,7 +170,6 @@ namespace libmpdataxx const int &sign ) final { - this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], sign); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[0], sign); this->mem->barrier(); @@ -181,7 +180,6 @@ namespace libmpdataxx const idx_t<2> &range_ijk ) final { - this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1]); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[0]); this->mem->barrier(); From 1a950ca958e4840be7b2b20b6542aa9ccccd9fd8 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Fri, 13 Nov 2020 19:00:39 +0100 Subject: [PATCH 38/44] fix open_3d fill_vctr_alng for new (kij) shmem decomposition --- libmpdata++/bcond/open_3d.hpp | 12 ++++++------ libmpdata++/solvers/detail/solver_3d.hpp | 6 ++++-- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/libmpdata++/bcond/open_3d.hpp b/libmpdata++/bcond/open_3d.hpp index b621748a..da499cfb 100644 --- 
a/libmpdata++/bcond/open_3d.hpp +++ b/libmpdata++/bcond/open_3d.hpp @@ -78,15 +78,15 @@ namespace libmpdataxx // TODO: exactly the same code below! switch (d) // note: order and lack of breaks intentional! { - case 0: + case 1: av[d+2](pi(i, j, (k-h).first())) = 0; av[d+2](pi(i, j, (k+h).last() )) = 0; - case 1: + case 2: av[d+1](pi(i, (j-h).first(), k)) = 0; av[d+1](pi(i, (j+h).last(), k)) = 0; - case 2: + case 0: break; default: assert(false); @@ -190,15 +190,15 @@ namespace libmpdataxx switch (d) // note: order and lack of breaks intentional! { - case 0: + case 1: av[d+2](pi(i, j, (k-h).first())) = 0; av[d+2](pi(i, j, (k+h).last() )) = 0; - case 1: + case 2: av[d+1](pi(i, (j-h).first(), k)) = 0; av[d+1](pi(i, (j+h).last(), k)) = 0; - case 2: + case 0: break; default: assert(false); diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index e96f9455..6b396606 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -66,10 +66,11 @@ namespace libmpdataxx this->mem->barrier(); if (!cyclic) { - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); - barrier_if_single_threaded_bc0(); + // order matters! see open_3d bcond for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng(arrvec, i, j, ad); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); } else { @@ -79,6 +80,7 @@ namespace libmpdataxx for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng_cyclic(arrvec, i, j, ad); } this->mem->barrier(); + } virtual void xchng_flux(arrvec_t &arrvec) final From ac82a7345c34541821681bfb731e5eb944487ccb Mon Sep 17 00:00:00 2001 From: pdziekan Date: Fri, 13 Nov 2020 19:01:42 +0100 Subject: [PATCH 39/44] add barrier at start of set_edges --- libmpdata++/solvers/detail/solver_2d.hpp | 2 ++ libmpdata++/solvers/detail/solver_3d.hpp | 2 ++ 2 files changed, 4 insertions(+) diff --git a/libmpdata++/solvers/detail/solver_2d.hpp b/libmpdata++/solvers/detail/solver_2d.hpp index 2a95b563..c556e5cc 100644 --- a/libmpdata++/solvers/detail/solver_2d.hpp +++ b/libmpdata++/solvers/detail/solver_2d.hpp @@ -170,6 +170,7 @@ namespace libmpdataxx const int &sign ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], sign); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[0], sign); this->mem->barrier(); @@ -180,6 +181,7 @@ namespace libmpdataxx const idx_t<2> &range_ijk ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1]); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[0]); this->mem->barrier(); diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index 6b396606..e2b0a3be 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -240,6 +240,7 @@ namespace libmpdataxx const int &sign ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[2], range_ijk[0], sign); @@ -252,6 +253,7 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); barrier_if_single_threaded_bc0(); for (auto &bc : 
this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[2], range_ijk[0]); From 13dec3c3c6db7861e7c4e73f1b6ba36d417d849e Mon Sep 17 00:00:00 2001 From: pdziekan Date: Fri, 13 Nov 2020 19:02:11 +0100 Subject: [PATCH 40/44] solvers: non-overlapping jm range for sharedmem 3d --- libmpdata++/solvers/detail/mpdata_osc_3d.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmpdata++/solvers/detail/mpdata_osc_3d.hpp b/libmpdata++/solvers/detail/mpdata_osc_3d.hpp index 8c7901e8..a2eab480 100644 --- a/libmpdata++/solvers/detail/mpdata_osc_3d.hpp +++ b/libmpdata++/solvers/detail/mpdata_osc_3d.hpp @@ -254,7 +254,7 @@ namespace libmpdataxx ) : parent_t(args, p), im(args.i.first() - 1, args.i.last()), - jm(args.j.first() - 1, args.j.last()), + jm(this->rank == 0 ? args.j.first() - 1 : args.j.first(), args.j.last()), km(args.k.first() - 1, args.k.last()) { } }; From 38c741653b864ab06d9e5732fce6c62e07a76fb5 Mon Sep 17 00:00:00 2001 From: pdziekan Date: Thu, 26 Nov 2020 14:27:35 +0100 Subject: [PATCH 41/44] osc_2d: nonoverlapping im --- libmpdata++/solvers/detail/mpdata_osc_2d.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libmpdata++/solvers/detail/mpdata_osc_2d.hpp b/libmpdata++/solvers/detail/mpdata_osc_2d.hpp index f86e9927..70b53d60 100644 --- a/libmpdata++/solvers/detail/mpdata_osc_2d.hpp +++ b/libmpdata++/solvers/detail/mpdata_osc_2d.hpp @@ -230,7 +230,7 @@ namespace libmpdataxx const typename parent_t::rt_params_t &p ) : parent_t(args, p), - im(args.i.first() - 1, args.i.last()), + im(this->rank == 0 ? args.i.first() - 1 : args.i.first(), args.i.last()), jm(args.j.first() - 1, args.j.last()) { } }; From 03d7a8262b40aea38e2a4a3c9de2ce110bb542b2 Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Thu, 15 Apr 2021 15:09:32 +0200 Subject: [PATCH 42/44] replace C++ MPI calls with C MPI calls --- libmpdata++/concurr/detail/distmem.hpp | 4 ++-- libmpdata++/solvers/detail/solver_common.hpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libmpdata++/concurr/detail/distmem.hpp b/libmpdata++/concurr/detail/distmem.hpp index 0465b830..78dc4e78 100644 --- a/libmpdata++/concurr/detail/distmem.hpp +++ b/libmpdata++/concurr/detail/distmem.hpp @@ -120,10 +120,10 @@ namespace libmpdataxx // init mpi here, since distmem is constructed before hdf5 // will be finalized in slvr_common dtor, since distmem is destructed before hdf5; // boost::mpi::enviro being global var caused deadlocks when freeing memory - if(!MPI::Is_initialized()) + if(!MPI_Is_initialized()) { mpi_initialized_before = false; - MPI::Init_thread(MPI_THREAD_MULTIPLE); + MPI_Init_thread(MPI_THREAD_MULTIPLE); } if (boost::mpi::environment::thread_level() != boost::mpi::threading::multiple) { diff --git a/libmpdata++/solvers/detail/solver_common.hpp b/libmpdata++/solvers/detail/solver_common.hpp index c70cb54e..1bd507ba 100644 --- a/libmpdata++/solvers/detail/solver_common.hpp +++ b/libmpdata++/solvers/detail/solver_common.hpp @@ -255,7 +255,7 @@ namespace libmpdataxx // TODO: MPI standard requires that the same thread that called mpi_init // calls mpi_finalize, we don't ensure it if(!libmpdataxx::concurr::detail::mpi_initialized_before && rank==0) - MPI::Finalize(); + MPI_Finalize(); #endif #if !defined(NDEBUG) assert(hook_ante_step_called && "any overriding hook_ante_step() must call parent_t::hook_ante_step()"); From 98a43fe7c8e00fbb6dcaa7f49f4a5a3b5639ba3d Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Thu, 15 Apr 2021 15:30:39 +0200 Subject: [PATCH 43/44] fix mpi C bindings calls --- 
libmpdata++/concurr/detail/distmem.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libmpdata++/concurr/detail/distmem.hpp b/libmpdata++/concurr/detail/distmem.hpp index 78dc4e78..6cf60aea 100644 --- a/libmpdata++/concurr/detail/distmem.hpp +++ b/libmpdata++/concurr/detail/distmem.hpp @@ -120,10 +120,12 @@ namespace libmpdataxx // init mpi here, since distmem is constructed before hdf5 // will be finalized in slvr_common dtor, since distmem is destructed before hdf5; // boost::mpi::enviro being global var caused deadlocks when freeing memory - if(!MPI_Is_initialized()) + int mpi_initialized; + MPI_Initialized(&mpi_initialized); + if(!mpi_initialized) { mpi_initialized_before = false; - MPI_Init_thread(MPI_THREAD_MULTIPLE); + MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, nullptr); } if (boost::mpi::environment::thread_level() != boost::mpi::threading::multiple) { From 29796e777a747d861f3a50f16fd23a3ab61725ba Mon Sep 17 00:00:00 2001 From: Piotr Dziekan Date: Thu, 15 Apr 2021 18:31:30 +0200 Subject: [PATCH 44/44] mpi init: not null location for provided thlvl --- libmpdata++/concurr/detail/distmem.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libmpdata++/concurr/detail/distmem.hpp b/libmpdata++/concurr/detail/distmem.hpp index 6cf60aea..c8d6e0b2 100644 --- a/libmpdata++/concurr/detail/distmem.hpp +++ b/libmpdata++/concurr/detail/distmem.hpp @@ -125,7 +125,8 @@ namespace libmpdataxx if(!mpi_initialized) { mpi_initialized_before = false; - MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, nullptr); + int th_lvl_provided; + MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &th_lvl_provided); } if (boost::mpi::environment::thread_level() != boost::mpi::threading::multiple) {
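Note on patches 42-44: together they switch distmem from the removed C++ MPI bindings to the C API. MPI_Initialized() reports the initialization state through an out-parameter, and MPI_Init_thread() must be given a valid address for the provided thread level (passing nullptr there, as patch 43 did, is not allowed by the MPI standard, hence patch 44). The snippet below is a minimal sketch of the resulting initialization pattern, not code taken from the series; the helper name init_mpi_if_needed and the direct check of th_lvl_provided are illustrative assumptions (the library itself verifies the thread level through boost::mpi::environment::thread_level()).

  // minimal sketch (not part of the patch series) of the MPI C-API init
  // sequence that patches 42-44 converge on in distmem's ctor
  #include <mpi.h>
  #include <stdexcept>

  // hypothetical helper name, for illustration only
  void init_mpi_if_needed(bool &mpi_initialized_before)
  {
    int initialized = 0;
    MPI_Initialized(&initialized);    // C binding: result comes back via an out-parameter
    if (!initialized)
    {
      mpi_initialized_before = false; // this process then owns the matching MPI_Finalize()
      int th_lvl_provided;            // must be a valid address (the patch 44 fix)
      MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &th_lvl_provided);
      if (th_lvl_provided != MPI_THREAD_MULTIPLE) // sketch checks the returned level directly
        throw std::runtime_error("MPI_THREAD_MULTIPLE not supported");
    }
  }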