diff --git a/libmpdata++/bcond/detail/bcond_common.hpp b/libmpdata++/bcond/detail/bcond_common.hpp index 1e9c6df2..92cf4445 100644 --- a/libmpdata++/bcond/detail/bcond_common.hpp +++ b/libmpdata++/bcond/detail/bcond_common.hpp @@ -193,6 +193,8 @@ namespace libmpdataxx virtual void avg_edge_and_halo1_sclr_cyclic(arr_3d_t &, const rng_t &, const rng_t &) {}; + const bool single_threaded; + protected: // sclr int @@ -207,7 +209,7 @@ namespace libmpdataxx public: // ctor - bcond_common(const rng_t &i, const std::array &) : + bcond_common(const rng_t &i, const std::array &, bool single_threaded = false) : // sclr left_edge_sclr( i.first() @@ -247,7 +249,8 @@ namespace libmpdataxx rght_intr_vctr( (i^h^(-1)).last() - (halo - 1), (i^h^(-1)).last() - ) + ), + single_threaded(single_threaded) {} // the one for use in shared diff --git a/libmpdata++/bcond/detail/polar_common.hpp b/libmpdata++/bcond/detail/polar_common.hpp index ebdb8326..193cf72a 100644 --- a/libmpdata++/bcond/detail/polar_common.hpp +++ b/libmpdata++/bcond/detail/polar_common.hpp @@ -37,10 +37,10 @@ namespace libmpdataxx // ctor polar_common( const rng_t &i, - const std::array &grid_size + const std::array &distmem_grid_size ) : - parent_t(i, grid_size), - pole((grid_size[0] - 1) / 2) + parent_t(i, distmem_grid_size), + pole((distmem_grid_size[0] - 1) / 2) {} }; } // namespace detail diff --git a/libmpdata++/bcond/detail/remote_3d_common.hpp b/libmpdata++/bcond/detail/remote_3d_common.hpp new file mode 100644 index 00000000..6aaf0aa3 --- /dev/null +++ b/libmpdata++/bcond/detail/remote_3d_common.hpp @@ -0,0 +1,116 @@ +// common code for ``remote'' MPI boundary conditions for libmpdata++ +// +// licensing: GPU GPL v3 +// copyright: University of Warsaw + +#pragma once + +#include + +namespace libmpdataxx +{ + namespace bcond + { + namespace detail + { + template + class remote_3d_common : public remote_common + { + using parent_t = detail::remote_common; + + protected: + + using arr_t = typename parent_t::arr_t; + using idx_t = typename parent_t::idx_t; + + const int thread_rank, thread_size; + + private: + + const rng_t thread_j; + const int grid_size_y; + + // try to guess what should be the whole domain exchanged by this process + // based on the difference between idx to be sent by this thread and idx of this process + idx_t extend_idx(idx_t idx) + { +//std::cerr << "extend idx start idx(1): " << idx.lbound(1) << ", " << idx.ubound(1) << std::endl; + idx.lbound(1) = 0 + idx.lbound(1) - thread_j.first(); // does it have to start at 0? + idx.ubound(1) = (grid_size_y - 1) + idx.ubound(1) - thread_j.last(); // does it have to end at grid_size_y - 1? 
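An aside, not part of the patch: the extend_idx() logic above stretches a thread-local j-range to the full process domain so that a single thread can post the MPI message for the whole slab. A minimal, self-contained sketch of that arithmetic follows; Range and Box are simplified stand-ins for rng_t and the blitz RectDomain idx_t, and the numbers are made up.

#include <cassert>

// Range and Box are simplified stand-ins for rng_t and idx_t
struct Range { int first, last; };
struct Box   { Range dim[3]; };  // dim[1] is the y (j) extent

// stretch the j extent of a thread-local box to the whole process domain,
// keeping the same offsets relative to the thread's own j-range
Box extend_idx(Box idx, const Range &thread_j, int grid_size_y)
{
  idx.dim[1].first = 0                 + idx.dim[1].first - thread_j.first;
  idx.dim[1].last  = (grid_size_y - 1) + idx.dim[1].last  - thread_j.last;
  return idx;
}

int main()
{
  // a thread owning j in [8,15] of a 32-point y domain wants to send j in [8,15]
  Box send{{{0, 2}, {8, 15}, {0, 63}}};
  Box full = extend_idx(send, Range{8, 15}, 32);
  assert(full.dim[1].first == 0 && full.dim[1].last == 31);  // whole process domain in j
  return 0;
}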
+//std::cerr << "extend idx end idx(1): " << idx.lbound(1) << ", " << idx.ubound(1) << std::endl; + return idx; + } + + public: + + void xchng ( + const arr_t &a, + const idx_t &idx_send, + const idx_t &idx_recv + ) + { + parent_t::xchng(a, extend_idx(idx_send), extend_idx(idx_recv)); + } + + void send ( + const arr_t &a, + const idx_t &idx_send + ) + { + parent_t::send(a, extend_idx(idx_send)); + } + + void recv ( + const arr_t &a, + const idx_t &idx_recv + ) + { + parent_t::recv(a, extend_idx(idx_recv)); + } + + // ctor + remote_3d_common( + const rng_t &i, + const std::array &distmem_grid_size, + const rng_t _thread_j, + const int thread_rank, + const int thread_size + ) : + parent_t(i, distmem_grid_size, true), // true indicating that this is a bcond done with a single thread + thread_rank(thread_rank), + thread_size(thread_size), + thread_j(_thread_j), + grid_size_y(distmem_grid_size[1]) + { +#if defined(USE_MPI) + // only 2 threads do mpi, others don't need buffers + if(thread_rank != 0 && thread_rank != thread_size-1) + { + free(parent_t::buf_send); + free(parent_t::buf_recv); + } +//std::cerr << "remote_3d_common ctor thread_j: " << thread_j.lbound(0) << ", " << thread_j.ubound(0) << std::endl; +//std::cerr << "remote_3d_common ctor _thread_j: " << _thread_j.lbound(0) << ", " << _thread_j.ubound(0) << std::endl; +//std::cerr << "remote_3d_common ctor f-l thread_j: " << thread_j.first() << ", " << thread_j.last() << std::endl; +//std::cerr << "remote_3d_common ctor f-l _thread_j: " << _thread_j.first() << ", " << _thread_j.last() << std::endl; +#endif + } + + // dtor + ~remote_3d_common() + { +#if defined(USE_MPI) + if(thread_rank == 0 || thread_rank == thread_size-1) + { + free(parent_t::buf_send); + free(parent_t::buf_recv); + } + // hack to make free in ~remote_common give defined behaviour + parent_t::buf_send = nullptr; + parent_t::buf_recv = nullptr; +#endif + } + }; + }; + } // namespace bcond +} // namespace libmpdataxx diff --git a/libmpdata++/bcond/detail/remote_common.hpp b/libmpdata++/bcond/detail/remote_common.hpp index 2d2cfba7..034b33a5 100644 --- a/libmpdata++/bcond/detail/remote_common.hpp +++ b/libmpdata++/bcond/detail/remote_common.hpp @@ -29,21 +29,22 @@ namespace libmpdataxx using arr_t = blitz::Array; using idx_t = blitz::RectDomain; - private: + real_t *buf_send, + *buf_recv; - const int grid_size_0; + private: #if defined(USE_MPI) boost::mpi::communicator mpicom; - real_t *buf_send, - *buf_recv; # if defined(NDEBUG) static const int n_reqs = 2; // data, reqs for recv only is enough? - static const int n_dbg_reqs = 0; + static const int n_dbg_send_reqs = 0; + static const int n_dbg_tags = 0; # else static const int n_reqs = 4; // data + ranges - static const int n_dbg_reqs = 1; + static const int n_dbg_send_reqs = 1; + static const int n_dbg_tags = 2; # endif std::array reqs; @@ -53,7 +54,6 @@ namespace libmpdataxx : (mpicom.rank() + 1 ) % mpicom.size(); # if !defined(NDEBUG) - const int debug = 2; std::pair buf_rng; # endif #endif @@ -78,6 +78,12 @@ namespace libmpdataxx const int msg_send = dir == left ? 
left : rght; +// std::cerr << "send_hlpr idx dir " << dir << " : " +// << " (" << idx_send.lbound(0) << ", " << idx_send.ubound(0) << ")" +// << " (" << idx_send.lbound(1) << ", " << idx_send.ubound(1) << ")" +// << " (" << idx_send.lbound(2) << ", " << idx_send.ubound(2) << ")" +// << std::endl; + // arr_send references part of the send buffer that will be used arr_t arr_send(buf_send, a(idx_send).shape(), blitz::neverDeleteData); // copying data to be sent @@ -92,7 +98,7 @@ namespace libmpdataxx // sending debug information # if !defined(NDEBUG) - reqs[1] = mpicom.isend(peer, msg_send ^ debug, std::pair( + reqs[1] = mpicom.isend(peer, msg_send + n_dbg_tags, std::pair( idx_send[0].first(), idx_send[0].last() )); @@ -112,15 +118,21 @@ namespace libmpdataxx const int msg_recv = dir == left ? rght : left; +// std::cerr << "recv_hlpr idx dir " << dir << " : " +// << " (" << idx_recv.lbound(0) << ", " << idx_recv.ubound(0) << ")" +// << " (" << idx_recv.lbound(1) << ", " << idx_recv.ubound(1) << ")" +// << " (" << idx_recv.lbound(2) << ", " << idx_recv.ubound(2) << ")" +// << std::endl; + // launching async data transfer if(a(idx_recv).size()!=0) // TODO: test directly size of idx_recv { - reqs[1+n_dbg_reqs] = mpicom.irecv(peer, msg_recv, buf_recv, a(idx_recv).size()); + reqs[1+n_dbg_send_reqs] = mpicom.irecv(peer, msg_recv, buf_recv, a(idx_recv).size()); // sending debug information # if !defined(NDEBUG) - reqs[3] = mpicom.irecv(peer, msg_recv ^ debug, buf_rng); + reqs[3] = mpicom.irecv(peer, msg_recv + n_dbg_tags, buf_rng); # endif } #else @@ -137,7 +149,7 @@ namespace libmpdataxx send_hlpr(a, idx_send); // waiting for the transfers to finish - boost::mpi::wait_all(reqs.begin(), reqs.begin() + 1 + n_dbg_reqs); // MPI_Waitall is thread-safe? + boost::mpi::wait_all(reqs.begin(), reqs.begin() + 1 + n_dbg_send_reqs); // MPI_Waitall is thread-safe? #else assert(false); #endif @@ -153,7 +165,7 @@ namespace libmpdataxx recv_hlpr(a, idx_recv); // waiting for the transfers to finish - boost::mpi::wait_all(reqs.begin() + 1 + n_dbg_reqs, reqs.end()); // MPI_Waitall is thread-safe? + boost::mpi::wait_all(reqs.begin() + 1 + n_dbg_send_reqs, reqs.end()); // MPI_Waitall is thread-safe? // a blitz handler for the used part of the receive buffer arr_t arr_recv(buf_recv, a(idx_recv).shape(), blitz::neverDeleteData); // TODO: shape directly from idx_recv @@ -207,13 +219,21 @@ namespace libmpdataxx // ctor remote_common( const rng_t &i, - const std::array &grid_size + const std::array &distmem_grid_size, + bool single_threaded = false ) : - parent_t(i, grid_size), - grid_size_0(grid_size[0]) + parent_t(i, distmem_grid_size, single_threaded) { #if defined(USE_MPI) - const int slice_size = n_dims==1 ? 1 : (n_dims==2? grid_size[1]+6 : (grid_size[1]+6) * (grid_size[2]+6) ); // 3 is the max halo size (?), so 6 on both sides + + const int slice_size = n_dims==1 ? 1 : (n_dims==2? 
distmem_grid_size[1]+6 : (distmem_grid_size[1]+6) * (distmem_grid_size[2]+6) ); // 3 is the max halo size (?), so 6 on both sides +//std::cerr << "remote_common ctor, " +// << " distmem_grid_size[0]: " << distmem_grid_size[0] +// << " distmem_grid_size[1]: " << distmem_grid_size[1] +// << " distmem_grid_size[2]: " << distmem_grid_size[2] +// << " slice_size: " << slice_size +// << " halo: " << halo +// << std::endl; // allocate enough memory in buffers to store largest halos to be sent buf_send = (real_t *) malloc(halo * slice_size * sizeof(real_t)); buf_recv = (real_t *) malloc(halo * slice_size * sizeof(real_t)); @@ -232,3 +252,4 @@ namespace libmpdataxx } } // namespace bcond } // namespace libmpdataxx + diff --git a/libmpdata++/bcond/open_3d.hpp b/libmpdata++/bcond/open_3d.hpp index b621748a..da499cfb 100644 --- a/libmpdata++/bcond/open_3d.hpp +++ b/libmpdata++/bcond/open_3d.hpp @@ -78,15 +78,15 @@ namespace libmpdataxx // TODO: exactly the same code below! switch (d) // note: order and lack of breaks intentional! { - case 0: + case 1: av[d+2](pi(i, j, (k-h).first())) = 0; av[d+2](pi(i, j, (k+h).last() )) = 0; - case 1: + case 2: av[d+1](pi(i, (j-h).first(), k)) = 0; av[d+1](pi(i, (j+h).last(), k)) = 0; - case 2: + case 0: break; default: assert(false); @@ -190,15 +190,15 @@ namespace libmpdataxx switch (d) // note: order and lack of breaks intentional! { - case 0: + case 1: av[d+2](pi(i, j, (k-h).first())) = 0; av[d+2](pi(i, j, (k+h).last() )) = 0; - case 1: + case 2: av[d+1](pi(i, (j-h).first(), k)) = 0; av[d+1](pi(i, (j+h).last(), k)) = 0; - case 2: + case 0: break; default: assert(false); diff --git a/libmpdata++/bcond/remote_3d.hpp b/libmpdata++/bcond/remote_3d.hpp index 2ec442b1..acceb4bd 100644 --- a/libmpdata++/bcond/remote_3d.hpp +++ b/libmpdata++/bcond/remote_3d.hpp @@ -5,7 +5,7 @@ #pragma once -#include +#include namespace libmpdataxx { @@ -18,21 +18,50 @@ namespace libmpdataxx dir == left && n_dims == 3 >::type - > : public detail::remote_common + > : public detail::remote_3d_common { - using parent_t = detail::remote_common; - using arr_t = blitz::Array; + using parent_t = detail::remote_3d_common; + using arr_t = typename parent_t::arr_t; + using idx_t = typename parent_t::idx_t; using parent_t::parent_t; // inheriting ctor const int off = this->is_cyclic ? 0 : -1; + void xchng ( + const arr_t &a, + const idx_t &idx_send, + const idx_t &idx_recv + ) + { + if(this->thread_rank!=0) return; // left MPI calls only done by thread rank 0 + parent_t::xchng(a, idx_send, idx_recv); + } + + void send ( + const arr_t &a, + const idx_t &idx_send + ) + { + if(this->thread_rank!=0) return; // left MPI calls only done by thread rank 0 + parent_t::send(a, idx_send); + } + + void recv ( + const arr_t &a, + const idx_t &idx_recv + ) + { + if(this->thread_rank!=0) return; // left MPI calls only done by thread rank 0 + parent_t::recv(a, idx_recv); + } + public: void fill_halos_sclr(arr_t &a, const rng_t &j, const rng_t &k, const bool deriv = false) { using namespace idxperm; - this->xchng(a, pi(this->left_intr_sclr + off, j, k), pi(this->left_halo_sclr, j, k)); + xchng(a, pi(this->left_intr_sclr + off, j, k), pi(this->left_halo_sclr, j, k)); } void fill_halos_pres(arr_t &a, const rng_t &j, const rng_t &k) @@ -52,12 +81,12 @@ namespace libmpdataxx { if(halo == 1) // see remote_2d - this->send(av[0], pi(this->left_intr_vctr + off, j, k)); // TODO: no need to receive? the vector in halo was calculated anyway? + send(av[0], pi(this->left_intr_vctr + off, j, k)); // TODO: no need to receive? 
the vector in halo was calculated anyway? else - this->xchng(av[0], pi(this->left_intr_vctr + off, j, k), pi((this->left_halo_vctr^h)^(-1), j, k)); // ditto + xchng(av[0], pi(this->left_intr_vctr + off, j, k), pi((this->left_halo_vctr^h)^(-1), j, k)); // ditto } else - this->xchng(av[0], pi(this->left_intr_vctr + off, j, k), pi(this->left_halo_vctr, j, k)); + xchng(av[0], pi(this->left_intr_vctr + off, j, k), pi(this->left_halo_vctr, j, k)); } void fill_halos_sgs_div(arr_t &a, const rng_t &j, const rng_t &k) @@ -73,12 +102,12 @@ namespace libmpdataxx { if(halo == 1) // see remote_2d - this->send(av[0 + offset], pi(this->left_intr_vctr + off, j, k)); + send(av[0 + offset], pi(this->left_intr_vctr + off, j, k)); else - this->xchng(av[0 + offset], pi(this->left_intr_vctr + off, j, k), pi((this->left_halo_vctr^h)^(-1), j, k)); + xchng(av[0 + offset], pi(this->left_intr_vctr + off, j, k), pi((this->left_halo_vctr^h)^(-1), j, k)); } else - this->xchng(av[0 + offset], pi(this->left_intr_vctr + off, j, k), pi(this->left_halo_vctr, j, k)); + xchng(av[0 + offset], pi(this->left_intr_vctr + off, j, k), pi(this->left_halo_vctr, j, k)); } void fill_halos_sgs_tnsr(arrvec_t &av, const arr_t &, const arr_t &, const rng_t &j, const rng_t &k, const real_t) @@ -109,7 +138,7 @@ namespace libmpdataxx using namespace idxperm; assert(halo>=1); - this->xchng(a, pi(this->left_edge_sclr, j, k), pi(this->left_halo_sclr.last(), j, k)); + xchng(a, pi(this->left_edge_sclr, j, k), pi(this->left_halo_sclr.last(), j, k)); } void avg_edge_and_halo1_sclr_cyclic(arr_t &a, const rng_t &j, const rng_t &k) @@ -131,20 +160,52 @@ namespace libmpdataxx dir == rght && n_dims == 3 >::type - > : public detail::remote_common + > : public detail::remote_3d_common { - using parent_t = detail::remote_common; - using arr_t = blitz::Array; + using parent_t = detail::remote_3d_common; + using arr_t = typename parent_t::arr_t; + using idx_t = typename parent_t::idx_t; using parent_t::parent_t; // inheriting ctor const int off = this->is_cyclic ? 
0 : 1; + void xchng ( + const arr_t &a, + const idx_t &idx_send, + const idx_t &idx_recv + ) + { + if(this->thread_rank != this->thread_size-1) return; // right MPI calls only done by the highest ranked thread + // if(this->thread_rank != 0) return; // temporary + parent_t::xchng(a, idx_send, idx_recv); + } + + void send ( + const arr_t &a, + const idx_t &idx_send + ) + { + if(this->thread_rank != this->thread_size-1) return; // right MPI calls only done by the highest ranked thread + // if(this->thread_rank != 0) return; // temporary + parent_t::send(a, idx_send); + } + + void recv ( + const arr_t &a, + const idx_t &idx_recv + ) + { + if(this->thread_rank != this->thread_size-1) return; // right MPI calls only done by the highest ranked thread + // if(this->thread_rank != 0) return; // temporary + parent_t::recv(a, idx_recv); + } + public: void fill_halos_sclr(arr_t &a, const rng_t &j, const rng_t &k, const bool deriv = false) { using namespace idxperm; - this->xchng(a, pi(this->rght_intr_sclr + off, j, k), pi(this->rght_halo_sclr, j, k)); + xchng(a, pi(this->rght_intr_sclr + off, j, k), pi(this->rght_halo_sclr, j, k)); } void fill_halos_pres(arr_t &a, const rng_t &j, const rng_t &k) @@ -163,12 +224,12 @@ namespace libmpdataxx if(!this->is_cyclic) { if(halo == 1) - this->recv(av[0], pi(this->rght_halo_vctr, j, k)); + recv(av[0], pi(this->rght_halo_vctr, j, k)); else - this->xchng(av[0], pi(((this->rght_intr_vctr + off)^h)^(-1), j, k), pi(this->rght_halo_vctr, j, k)); + xchng(av[0], pi(((this->rght_intr_vctr + off)^h)^(-1), j, k), pi(this->rght_halo_vctr, j, k)); } else - this->xchng(av[0], pi(this->rght_intr_vctr + off, j, k), pi(this->rght_halo_vctr, j, k)); + xchng(av[0], pi(this->rght_intr_vctr + off, j, k), pi(this->rght_halo_vctr, j, k)); } void fill_halos_sgs_div(arr_t &a, const rng_t &j, const rng_t &k) @@ -183,12 +244,12 @@ namespace libmpdataxx if(!this->is_cyclic) { if(halo == 1) - this->recv(av[0 + offset], pi(this->rght_halo_vctr, j, k)); + recv(av[0 + offset], pi(this->rght_halo_vctr, j, k)); else - this->xchng(av[0 + offset], pi(((this->rght_intr_vctr + off)^h)^(-1), j, k), pi(this->rght_halo_vctr, j, k)); + xchng(av[0 + offset], pi(((this->rght_intr_vctr + off)^h)^(-1), j, k), pi(this->rght_halo_vctr, j, k)); } else - this->xchng(av[0 + offset], pi(this->rght_intr_vctr + off, j, k), pi(this->rght_halo_vctr, j, k)); + xchng(av[0 + offset], pi(this->rght_intr_vctr + off, j, k), pi(this->rght_halo_vctr, j, k)); } void fill_halos_sgs_tnsr(arrvec_t &av, const arr_t &, const arr_t &, const rng_t &j, const rng_t &k, const real_t) @@ -219,7 +280,7 @@ namespace libmpdataxx using namespace idxperm; assert(halo>=1); - this->xchng(a, pi(this->rght_edge_sclr, j, k), pi(this->rght_halo_sclr.first(), j, k)); + xchng(a, pi(this->rght_edge_sclr, j, k), pi(this->rght_halo_sclr.first(), j, k)); } void avg_edge_and_halo1_sclr_cyclic(arr_t &a, const rng_t &j, const rng_t &k) diff --git a/libmpdata++/blitz.hpp b/libmpdata++/blitz.hpp index 606b0996..53273bde 100644 --- a/libmpdata++/blitz.hpp +++ b/libmpdata++/blitz.hpp @@ -112,5 +112,7 @@ namespace libmpdataxx #endif } }; -} // namespace libmpdataxx + // 3D blitz++ array storage with the k,i,j order for improved performance of sharedmem decomposition along j + const blitz::GeneralArrayStorage<3> arr3D_storage({blitz::thirdDim, blitz::firstDim, blitz::secondDim}, {true, true, true}); +} // namespace libmpdataxx diff --git a/libmpdata++/concurr/boost_thread.hpp b/libmpdata++/concurr/boost_thread.hpp index 83675a7c..f1d7c7a6 100644 --- 
a/libmpdata++/concurr/boost_thread.hpp +++ b/libmpdata++/concurr/boost_thread.hpp @@ -81,7 +81,7 @@ namespace libmpdataxx // ctor boost_thread(const typename solver_t::rt_params_t &p) : - parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[0])) + parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[solver_t::n_dims < 3 ? 0 : 1])) // note 3D domain decomposition in y direction {} }; diff --git a/libmpdata++/concurr/cxx11_thread.hpp b/libmpdata++/concurr/cxx11_thread.hpp index b89a556e..77dc47d9 100644 --- a/libmpdata++/concurr/cxx11_thread.hpp +++ b/libmpdata++/concurr/cxx11_thread.hpp @@ -119,7 +119,7 @@ namespace libmpdataxx // ctor cxx11_thread(const typename solver_t::rt_params_t &p) : - parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[0])) + parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[solver_t::n_dims < 3 ? 0 : 1])) // note 3D domain decomposition in y direction {} }; diff --git a/libmpdata++/concurr/detail/concurr_common.hpp b/libmpdata++/concurr/detail/concurr_common.hpp index 691e4ff0..9cacedd2 100644 --- a/libmpdata++/concurr/detail/concurr_common.hpp +++ b/libmpdata++/concurr/detail/concurr_common.hpp @@ -40,6 +40,81 @@ namespace libmpdataxx { namespace detail { + template < + class real_t, + bcond::drctn_e dir, + int dim, + int n_dims, + int halo, + class bcp_t, + class mem_t + > + struct bc_set_remote_impl + { + static void _( + bcp_t &bcp, + const std::unique_ptr &mem, + const int thread_rank, + const int thread_size + ) + { + bcp.reset( + new bcond::bcond( + mem->slab(mem->grid_size[dim]), + mem->distmem.grid_size + ) + ); + } + }; + + template < + class real_t, + bcond::drctn_e dir, + int dim, + int halo, + class bcp_t, + class mem_t + > + struct bc_set_remote_impl + { + static void _( + bcp_t &bcp, + const std::unique_ptr &mem, + const int thread_rank, + const int thread_size + ) + { + bcp.reset( + new bcond::bcond( + mem->slab(mem->grid_size[dim]), + mem->distmem.grid_size, + mem->slab(mem->grid_size[1], thread_rank, thread_size), // NOTE: we assume here remote 3d bcond is only on the edges perpendicular to x + thread_rank, + thread_size + ) + ); + } + }; + + template < + class real_t, + bcond::drctn_e dir, + int dim, + int n_dims, + int halo, + class bcp_t, + class mem_t + > + void bc_set_remote( + bcp_t &bcp, + const std::unique_ptr &mem, + const int thread_rank, + const int thread_size + ) + { + bc_set_remote_impl::_(bcp, mem, thread_rank, thread_size); + } + template< class solver_t_, bcond::bcond_e bcxl, bcond::bcond_e bcxr, @@ -115,7 +190,9 @@ namespace libmpdataxx int dim > void bc_set( - typename solver_t::bcp_t &bcp + typename solver_t::bcp_t &bcp, + const int thread_rank = 0, // required only by 3D remote (MPI) bcond + const int thread_size = 0 // required only by 3D remote (MPI) bcond ) { // sanity check - polar coords do not work with MPI yet @@ -123,7 +200,7 @@ namespace libmpdataxx throw std::runtime_error("Polar boundary conditions do not work with MPI."); // distmem overrides - if (type != bcond::remote && mem->distmem.size() > 1 && dim == 0) + if (mem->distmem.size() > 1 && dim == 0) { if ( // distmem domain interior @@ -133,10 +210,19 @@ namespace libmpdataxx // cyclic condition for distmem domain (note: will not work if a non-cyclic condition is on the other end) || (type == bcond::cyclic) - ) return bc_set(bcp); + ) + { + // bc allocation, all mpi routines called by the remote bcnd ctor are thread-safe (?) 
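An aside, not part of the patch: bc_set_remote_impl above dispatches on n_dims through a class template with a static member because function templates cannot be partially specialized. A minimal sketch of that pattern is below; the names make_remote/make_remote_impl are illustrative only.

#include <iostream>

// generic case: thread info is not needed (1D/2D remote bcond in the patch)
template <class real_t, int n_dims>
struct make_remote_impl
{
  static void _(int /*thread_rank*/, int /*thread_size*/)
  {
    std::cout << "generic remote bcond, n_dims = " << n_dims << "\n";
  }
};

// partial specialization: only n_dims is fixed, real_t stays generic;
// the 3D case also receives the thread decomposition
template <class real_t>
struct make_remote_impl<real_t, 3>
{
  static void _(int thread_rank, int thread_size)
  {
    std::cout << "3D remote bcond, thread " << thread_rank << "/" << thread_size << "\n";
  }
};

// thin forwarding function template, analogous to bc_set_remote()
template <class real_t, int n_dims>
void make_remote(int thread_rank, int thread_size)
{
  make_remote_impl<real_t, n_dims>::_(thread_rank, thread_size);
}

int main()
{
  make_remote<double, 2>(0, 4);
  make_remote<double, 3>(3, 4);
  return 0;
}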
+ bc_set_remote( + bcp, + mem, + thread_rank, + thread_size + ); + return; + } } - // bc allocation, all mpi routines called by the remote bcnd ctor are thread-safe (?) bcp.reset( new bcond::bcond( mem->slab(mem->grid_size[dim]), @@ -153,6 +239,7 @@ namespace libmpdataxx { typename solver_t::bcp_t bxl, bxr, shrdl, shrdr; + // NOTE: for remote bcond, thread_rank set to 0 on purpose in 1D to have propre left/right message tags bc_set(bxl); bc_set(bxr); @@ -189,6 +276,7 @@ namespace libmpdataxx { typename solver_t::bcp_t bxl, bxr, byl, byr, shrdl, shrdr; + // NOTE: for remote bcond, thread_rank set to 0 on purpose in 2D to have propre left/right message tags bc_set(bxl); bc_set(bxr); @@ -216,11 +304,11 @@ namespace libmpdataxx } } - // 3D version + // 3D version, note sharedmem in y direction! void init( const typename solver_t::rt_params_t &p, const std::array &grid_size, - const int &n0, const int &n1 = 1, const int &n2 = 1 + const int &n1, const int &n0 = 1, const int &n2 = 1 ) { typename solver_t::bcp_t bxl, bxr, byl, byr, bzl, bzr, shrdl, shrdr; @@ -231,8 +319,9 @@ namespace libmpdataxx { for (int i2 = 0; i2 < n2; ++i2) { - bc_set(bxl); - bc_set(bxr); + // i1 is the local thread rank, n1 is the number of threads. These are needed by remote bcond, because only rank=0 does mpi communication + bc_set(bxl, i1, n1); + bc_set(bxr, i1, n1); bc_set(byl); bc_set(byr); @@ -246,11 +335,11 @@ namespace libmpdataxx algos.push_back( new solver_t( typename solver_t::ctor_args_t({ - i0, + i1, mem.get(), - i0 == 0 ? bxl : shrdl, - i0 == n0 - 1 ? bxr : shrdr, - byl, byr, + bxl, bxr, + i1 == 0 ? byl : shrdl, + i1 == n1 - 1 ? byr : shrdr, bzl, bzr, mem->slab(grid_size[0], i0, n0), mem->slab(grid_size[1], i1, n1), diff --git a/libmpdata++/concurr/detail/distmem.hpp b/libmpdata++/concurr/detail/distmem.hpp index 0465b830..c8d6e0b2 100644 --- a/libmpdata++/concurr/detail/distmem.hpp +++ b/libmpdata++/concurr/detail/distmem.hpp @@ -120,10 +120,13 @@ namespace libmpdataxx // init mpi here, since distmem is constructed before hdf5 // will be finalized in slvr_common dtor, since distmem is destructed before hdf5; // boost::mpi::enviro being global var caused deadlocks when freeing memory - if(!MPI::Is_initialized()) + int mpi_initialized; + MPI_Initialized(&mpi_initialized); + if(!mpi_initialized) { mpi_initialized_before = false; - MPI::Init_thread(MPI_THREAD_MULTIPLE); + int th_lvl_provided; + MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &th_lvl_provided); } if (boost::mpi::environment::thread_level() != boost::mpi::threading::multiple) { diff --git a/libmpdata++/concurr/detail/sharedmem.hpp b/libmpdata++/concurr/detail/sharedmem.hpp index cf927d6a..39ccb7c5 100644 --- a/libmpdata++/concurr/detail/sharedmem.hpp +++ b/libmpdata++/concurr/detail/sharedmem.hpp @@ -29,7 +29,6 @@ namespace libmpdataxx > class sharedmem_common { - using arr_t = blitz::Array; static_assert(n_dims > 0, "n_dims <= 0"); static_assert(n_tlev > 0, "n_tlev <= 0"); @@ -39,6 +38,7 @@ namespace libmpdataxx protected: + using arr_t = blitz::Array; blitz::TinyVector origin; public: @@ -48,6 +48,11 @@ namespace libmpdataxx std::array grid_size; bool panic = false; // for multi-threaded SIGTERM handling + // dimension in which sharedmem domain decomposition is done + // 1D and 2D - domain decomposed in 0-th dimension (x) + // 3D - domain decomposed in 1-st dimension (y) for better workload balance in MPI runs (MPI is decomposed in x) + const int shmem_decomp_dim; + detail::distmem distmem; // TODO: these are public because used from 
outside in alloc - could friendship help? @@ -84,27 +89,25 @@ namespace libmpdataxx // ctors // TODO: fill reducetmp with NaNs (or use 1-element arrvec_t - it's NaN-filled by default) sharedmem_common(const std::array &grid_size, const int &size) - : n(0), distmem(grid_size), size(size) // TODO: is n(0) needed? + : n(0), distmem(grid_size), size(size), shmem_decomp_dim(n_dims < 3 ? 0 : 1) // TODO: is n(0) needed? { for (int d = 0; d < n_dims; ++d) { this->grid_size[d] = slab( rng_t(0, grid_size[d]-1), - d == 0 ? distmem.rank() : 0, + d == 0 ? distmem.rank() : 0, // decomposition along x, because that's MPI decomposition d == 0 ? distmem.size() : 1 + // d == shmem_decomp_dim ? distmem.rank() : 0, + // d == shmem_decomp_dim ? distmem.size() : 1 ); origin[d] = this->grid_size[d].first(); } - std::ostringstream oss; - oss << "grid_size[0]: " << this->grid_size[0] << " origin[0]: " << origin[0] << std::endl; - std::cerr << oss.str() << std::endl; - if (size > grid_size[0]) throw std::runtime_error("number of subdomains greater than number of gridpoints"); if (n_dims != 1) - sumtmp.reset(new blitz::Array(this->grid_size[0])); + sumtmp.reset(new blitz::Array(this->grid_size[shmem_decomp_dim])); xtmtmp.reset(new blitz::Array(size)); } @@ -113,11 +116,11 @@ namespace libmpdataxx { // doing a two-step sum to reduce numerical error // and make parallel results reproducible - for (int c = ijk[0].first(); c <= ijk[0].last(); ++c) // TODO: optimise for i.count() == 1 + for (int c = ijk[shmem_decomp_dim].first(); c <= ijk[shmem_decomp_dim].last(); ++c) // TODO: optimise for i.count() == 1 { auto slice_idx = ijk; - slice_idx.lbound(0) = c; - slice_idx.ubound(0) = c; + slice_idx.lbound(shmem_decomp_dim) = c; + slice_idx.ubound(shmem_decomp_dim) = c; if (sum_khn) (*sumtmp)(c) = blitz::kahan_sum(arr(slice_idx)); @@ -138,14 +141,14 @@ namespace libmpdataxx { // master thread calculates the sum from this process, stores in shared array if (sum_khn) - (*sumtmp)(grid_size[0].first())= blitz::kahan_sum(*sumtmp); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first())= blitz::kahan_sum(*sumtmp); // inplace?! else - (*sumtmp)(grid_size[0].first())= blitz::sum(*sumtmp); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first())= blitz::sum(*sumtmp); // inplace?! // master thread calculates sum of sums from all processes - (*sumtmp)(grid_size[0].first()) = this->distmem.sum((*sumtmp)(grid_size[0].first())); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first()) = this->distmem.sum((*sumtmp)(grid_size[shmem_decomp_dim].first())); // inplace?! 
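An aside, not part of the patch: the reduction above is deliberately done in two steps — each slice along the decomposition dimension is first reduced into its own slot of sumtmp, and only then are the slots reduced (and handed to distmem.sum() across MPI ranks) — so the result does not depend on how slices were assigned to threads. A minimal sketch, with plain vectors standing in for the blitz arrays:

#include <cassert>
#include <vector>

// step 1: reduce every slice into its own slot (in the real code each thread
//         handles the slices it owns and writes into the shared sumtmp array);
// step 2: the master reduces the slots (and would then call distmem.sum()
//         to add up the per-process results across MPI ranks)
double two_step_sum(const std::vector<std::vector<double>> &slices)
{
  std::vector<double> per_slice(slices.size(), 0.0);
  for (std::size_t c = 0; c < slices.size(); ++c)
    for (double v : slices[c])
      per_slice[c] += v;

  double total = 0.0;
  for (double s : per_slice)
    total += s;
  return total;
}

int main()
{
  const std::vector<std::vector<double>> slices = {{1, 2}, {3, 4}, {5, 6}};
  assert(two_step_sum(slices) == 21.0);
  return 0;
}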
} barrier(); - double res = (*sumtmp)(grid_size[0].first()); // propagate the total sum to all threads of the process + double res = (*sumtmp)(grid_size[shmem_decomp_dim].first()); // propagate the total sum to all threads of the process barrier(); // to avoid sumtmp being overwritten by next call to sum from other thread return res; #endif @@ -156,11 +159,11 @@ namespace libmpdataxx { // doing a two-step sum to reduce numerical error // and make parallel results reproducible - for (int c = ijk[0].first(); c <= ijk[0].last(); ++c) + for (int c = ijk[shmem_decomp_dim].first(); c <= ijk[shmem_decomp_dim].last(); ++c) { auto slice_idx = ijk; - slice_idx.lbound(0) = c; - slice_idx.ubound(0) = c; + slice_idx.lbound(shmem_decomp_dim) = c; + slice_idx.ubound(shmem_decomp_dim) = c; if (sum_khn) (*sumtmp)(c) = blitz::kahan_sum(arr1(slice_idx) * arr2(slice_idx)); @@ -182,14 +185,14 @@ namespace libmpdataxx { // master thread calculates the sum from this process, stores in shared array if (sum_khn) - (*sumtmp)(grid_size[0].first())= blitz::kahan_sum(*sumtmp); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first())= blitz::kahan_sum(*sumtmp); // inplace?! else - (*sumtmp)(grid_size[0].first())= blitz::sum(*sumtmp); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first())= blitz::sum(*sumtmp); // inplace?! // master thread calculates sum of sums from all processes - (*sumtmp)(grid_size[0].first()) = this->distmem.sum((*sumtmp)(grid_size[0].first())); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first()) = this->distmem.sum((*sumtmp)(grid_size[shmem_decomp_dim].first())); // inplace?! } barrier(); - double res = (*sumtmp)(grid_size[0].first()); // propagate the total sum to all threads of the process + double res = (*sumtmp)(grid_size[shmem_decomp_dim].first()); // propagate the total sum to all threads of the process barrier(); // to avoid sumtmp being overwritten by next call to sum from other thread return res; #endif @@ -267,7 +270,7 @@ namespace libmpdataxx boost::ptr_vector tobefreed; public: - arr_t *never_delete(arr_t *arg) + virtual arr_t *never_delete(arr_t *arg) { arr_t *ret = new arr_t(arg->dataFirst(), arg->shape(), blitz::neverDeleteData); ret->reindexSelf(arg->base()); @@ -277,7 +280,7 @@ namespace libmpdataxx arr_t *old(arr_t *arg) { tobefreed.push_back(arg); - arr_t *ret = never_delete(arg); + arr_t *ret = this->never_delete(arg); return ret; } @@ -575,10 +578,18 @@ namespace libmpdataxx class sharedmem : public sharedmem_common { using parent_t = sharedmem_common; + using arr_t = typename parent_t::arr_t; using parent_t::parent_t; // inheriting ctors public: + virtual arr_t *never_delete(arr_t *arg) override + { + arr_t *ret = new arr_t(arg->dataFirst(), arg->shape(), blitz::neverDeleteData, blitz::GeneralArrayStorage<3>(arg->ordering(), {true, true, true})); + ret->reindexSelf(arg->base()); + return ret; + } + blitz::Array advectee(int e = 0) { assert(this->n < n_tlev); diff --git a/libmpdata++/concurr/openmp.hpp b/libmpdata++/concurr/openmp.hpp index 24f7c78b..d33cda98 100644 --- a/libmpdata++/concurr/openmp.hpp +++ b/libmpdata++/concurr/openmp.hpp @@ -77,7 +77,7 @@ namespace libmpdataxx // ctor openmp(const typename solver_t::rt_params_t &p) : - parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[0])) + parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[solver_t::n_dims < 3 ? 
0 : 1])) // note 3D domain decomposition in y direction {} }; diff --git a/libmpdata++/output/hdf5.hpp b/libmpdata++/output/hdf5.hpp index 553cedef..89c8013c 100644 --- a/libmpdata++/output/hdf5.hpp +++ b/libmpdata++/output/hdf5.hpp @@ -273,8 +273,10 @@ namespace libmpdataxx } case 3: { - typename solver_t::arr_t contiguous_arr(this->mem->grid_size[0], this->mem->grid_size[1], zro); - contiguous_arr = arr(this->mem->grid_size[0], this->mem->grid_size[1], zro); // create a copy that is contiguous + // create a copy that is contiguous and has the C-style (kji) storage order as required by HDF5 + typename solver_t::arr_t contiguous_arr(this->mem->grid_size[0], this->mem->grid_size[1], zro); + contiguous_arr = arr(this->mem->grid_size[0], this->mem->grid_size[1], zro); + dset.write(contiguous_arr.data(), flttype_solver, H5::DataSpace(parent_t::n_dims, srfcshape.data()), space, dxpl_id); break; } @@ -304,7 +306,10 @@ namespace libmpdataxx } case 3: { - typename solver_t::arr_t contiguous_arr = arr(this->mem->grid_size[0], this->mem->grid_size[1], this->mem->grid_size[2]).copy(); // create a copy that is contiguous + // create a copy that is contiguous and has the C-style (kji) storage order as required by HDF5 + typename solver_t::arr_t contiguous_arr(this->mem->grid_size[0], this->mem->grid_size[1], this->mem->grid_size[2]); + contiguous_arr = arr(this->mem->grid_size[0], this->mem->grid_size[1], this->mem->grid_size[2]); + dset.write(contiguous_arr.data(), flttype_solver, H5::DataSpace(parent_t::n_dims, shape.data()), space, dxpl_id); break; } @@ -312,7 +317,7 @@ namespace libmpdataxx }; } - // data is assumed to be contiguous and in the same layout as hdf variable + // data is assumed to be contiguous and in the same layout as hdf variable and in the C-style storage order void record_aux_hlpr(const std::string &name, typename solver_t::real_t *data, H5::H5File hdf) { assert(this->rank == 0); @@ -329,6 +334,7 @@ namespace libmpdataxx aux.write(data, flttype_solver, H5::DataSpace(parent_t::n_dims, shape.data()), space, dxpl_id); } + // data is assumed to be contiguous and in the same layout as hdf variable and in the C-style storage order void record_aux(const std::string &name, typename solver_t::real_t *data) { record_aux_hlpr(name, data, *hdfp); diff --git a/libmpdata++/solvers/detail/mpdata_osc_2d.hpp b/libmpdata++/solvers/detail/mpdata_osc_2d.hpp index f86e9927..70b53d60 100644 --- a/libmpdata++/solvers/detail/mpdata_osc_2d.hpp +++ b/libmpdata++/solvers/detail/mpdata_osc_2d.hpp @@ -230,7 +230,7 @@ namespace libmpdataxx const typename parent_t::rt_params_t &p ) : parent_t(args, p), - im(args.i.first() - 1, args.i.last()), + im(this->rank == 0 ? args.i.first() - 1 : args.i.first(), args.i.last()), jm(args.j.first() - 1, args.j.last()) { } }; diff --git a/libmpdata++/solvers/detail/mpdata_osc_3d.hpp b/libmpdata++/solvers/detail/mpdata_osc_3d.hpp index 8c7901e8..a2eab480 100644 --- a/libmpdata++/solvers/detail/mpdata_osc_3d.hpp +++ b/libmpdata++/solvers/detail/mpdata_osc_3d.hpp @@ -254,7 +254,7 @@ namespace libmpdataxx ) : parent_t(args, p), im(args.i.first() - 1, args.i.last()), - jm(args.j.first() - 1, args.j.last()), + jm(this->rank == 0 ? 
args.j.first() - 1 : args.j.first(), args.j.last()), km(args.k.first() - 1, args.k.last()) { } }; diff --git a/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp b/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp index be1161df..07d15f50 100644 --- a/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp +++ b/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp @@ -57,7 +57,7 @@ namespace libmpdataxx // ijkm with non-overlapping ranges idx_t ijkm_sep; - // like ijk, but with range in x direction extended by 1 to the left for rank=0 for MPI compliance. + // like ijk, but with range in x direction extended by 1 to the left for MPI compliance. // MPI requires that vector between two process domains is calculated by the process to the right of it (c.f. remote_2d.hpp fill_halos_vctr_alng) // TODO: change MPI logic to assume that it is calculated by the process to the left? then, ijk_vec would not be needed(?) std::array ijk_vec; @@ -202,14 +202,26 @@ namespace libmpdataxx ijk_vec[d] = rng_t(this->ijk[d].first(), this->ijk[d].last()); } - if (this->rank == 0) + + if(ct_params_t::n_dims < 3 && this->rank == 0 // 1D and 2D - sharedmem in x direction + || + ct_params_t::n_dims == 3 // 3D - sharedmem in y direction + ) ijk_vec[0] = rng_t(this->ijk[0].first() - 1, this->ijk[0].last()); ijkm_sep = ijkm; if (this->rank > 0) { - ijkm_sep.lbound()(0) = this->ijk[0].first(); - ijkm_sep.ubound()(0) = this->ijk[0].last(); + if(ct_params_t::n_dims < 3) + { + ijkm_sep.lbound()(0) = this->ijk[0].first(); + ijkm_sep.ubound()(0) = this->ijk[0].last(); + } + else + { + ijkm_sep.lbound()(1) = this->ijk[1].first(); + ijkm_sep.ubound()(1) = this->ijk[1].last(); + } } } diff --git a/libmpdata++/solvers/detail/solver_2d.hpp b/libmpdata++/solvers/detail/solver_2d.hpp index 2a95b563..c556e5cc 100644 --- a/libmpdata++/solvers/detail/solver_2d.hpp +++ b/libmpdata++/solvers/detail/solver_2d.hpp @@ -170,6 +170,7 @@ namespace libmpdataxx const int &sign ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], sign); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[0], sign); this->mem->barrier(); @@ -180,6 +181,7 @@ namespace libmpdataxx const idx_t<2> &range_ijk ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1]); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[0]); this->mem->barrier(); diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index b7a3825a..e2b0a3be 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -26,6 +26,11 @@ namespace libmpdataxx { using parent_t = solver_common; + void barrier_if_single_threaded_bc0() + { + if(this->bcs[0].at(0)->single_threaded || this->bcs[0].at(1)->single_threaded) this->mem->barrier(); + } + public: using real_t = typename ct_params_t::real_t; @@ -43,11 +48,12 @@ namespace libmpdataxx const bool deriv = false ) final // for a given array { - const auto range_ijk_0__ext = this->extend_range(range_ijk[0], ext); + const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv); - for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk_0__ext, deriv); - for (auto &bc : this->bcs[2]) bc->fill_halos_sclr(arr, range_ijk_0__ext, range_ijk[1]^ext, deriv); + for (auto &bc : 
this->bcs[0]) bc->single_threaded ? bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv) : bc->fill_halos_sclr(arr, range_ijk_1__ext, range_ijk[2]^ext, deriv); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk[0]^ext, deriv); + for (auto &bc : this->bcs[2]) bc->fill_halos_sclr(arr, range_ijk[0]^ext, range_ijk_1__ext, deriv); this->mem->barrier(); } void xchng(int e) final @@ -60,25 +66,31 @@ namespace libmpdataxx this->mem->barrier(); if (!cyclic) { - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); + // order matters! see open_3d bcond for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng(arrvec, i, j, ad); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); } else { for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng_cyclic(arrvec, j, k, ad); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng_cyclic(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng_cyclic(arrvec, i, j, ad); } this->mem->barrier(); + } virtual void xchng_flux(arrvec_t &arrvec) final { this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->fill_halos_flux(arrvec, j, k); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->fill_halos_flux(arrvec, k, i); for (auto &bc : this->bcs[2]) bc->fill_halos_flux(arrvec, i, j); + this->mem->barrier(); } virtual void xchng_sgs_div( @@ -89,6 +101,7 @@ namespace libmpdataxx this->mem->barrier(); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_div(arr, range_ijk[0], range_ijk[1]); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_div(arr, range_ijk[2]^h, range_ijk[0]); + barrier_if_single_threaded_bc0(); // to make sure that all threads have filled range_ijk[2]^h before it is used to fill along x for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_div(arr, range_ijk[1], range_ijk[2]^h); this->mem->barrier(); } @@ -100,6 +113,7 @@ namespace libmpdataxx { this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_vctr(av, b, range_ijk[1], range_ijk[2]); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_vctr(av, b, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_vctr(av, b, range_ijk[0], range_ijk[1]); this->mem->barrier(); @@ -113,6 +127,7 @@ namespace libmpdataxx { this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[1], range_ijk[2], this->dijk[0]); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[2], range_ijk[0], this->dijk[1]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[0], range_ijk[1], this->dijk[2]); this->mem->barrier(); @@ -131,6 +146,7 @@ namespace libmpdataxx bc->fill_halos_sgs_vctr(av, bv[0], range_ijkm[1], range_ijk[2]^1, 3); bc->fill_halos_sgs_vctr(av, bv[1], range_ijk[1]^1, range_ijkm[2], 4); } + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) { @@ -154,8 +170,8 @@ namespace libmpdataxx ) final { this->mem->barrier(); - const auto range_ijk_0__ext_h = this->extend_range(range_ijk[0], ext, h); - const auto range_ijk_0__ext_1 = this->extend_range(range_ijk[0], ext, 1); + const auto range_ijk_1__ext_h = this->extend_range(range_ijk[1], ext, h); + const auto range_ijk_1__ext_1 = this->extend_range(range_ijk[1], ext, 
1); if (!cyclic) { for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk[2]^ext^1, range_ijk[0]^ext^h); @@ -169,24 +185,36 @@ namespace libmpdataxx this->mem->barrier(); } - for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk_0__ext_h, range_ijk[1]^ext^1); + for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk[0]^ext^h, range_ijk_1__ext_1); + + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[0]) bc->single_threaded ? bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1) : bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + barrier_if_single_threaded_bc0(); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); - for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_0__ext_1, range_ijk[1]^ext^h); + for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); - for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk_0__ext_1); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[0]) bc->single_threaded ? bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h) : bc->fill_halos_vctr_nrml(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + barrier_if_single_threaded_bc0(); + + for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); } else { - for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[2]^ext^1, range_ijk_0__ext_h); - for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk_0__ext_h, range_ijk[1]^ext^1); + for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[2]^ext^1, range_ijk[0]^ext^h); + for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[0]^ext^h, range_ijk_1__ext_1); + + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[0]) bc->single_threaded ? bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1) : bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + barrier_if_single_threaded_bc0(); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); - for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_0__ext_1, range_ijk[1]^ext^h); + for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); - for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[2]^ext^h, range_ijk_0__ext_1); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[0]) bc->single_threaded ? 
bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h) : bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + barrier_if_single_threaded_bc0(); + + for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); } this->mem->barrier(); } @@ -197,11 +225,12 @@ namespace libmpdataxx const int ext = 0 ) final { - const auto range_ijk_0__ext = this->extend_range(range_ijk[0], ext); + const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext); - for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk_0__ext); - for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk_0__ext, range_ijk[1]^ext); + for (auto &bc : this->bcs[0]) bc->single_threaded ? bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext) : bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk[0]^ext); + for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk[0]^ext, range_ijk_1__ext); this->mem->barrier(); } @@ -211,7 +240,9 @@ namespace libmpdataxx const int &sign ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[2], range_ijk[0], sign); for (auto &bc : this->bcs[2]) bc->set_edge_pres(av[2], range_ijk[0], range_ijk[1], sign); this->mem->barrier(); @@ -222,7 +253,9 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->save_edge_vel(av[2], range_ijk[0], range_ijk[1]); this->mem->barrier(); @@ -233,14 +266,18 @@ namespace libmpdataxx ) final { this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[1], range_ijk[2]); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[0]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[1], range_ijk[2]); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[1]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[0], range_ijk[1]); for (auto &bc : this->bcs[2]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[0], range_ijk[1]); + this->mem->barrier(); } @@ -339,6 +376,7 @@ namespace libmpdataxx this->set_bcs(2, args.bczl, args.bczr); } + public: static void alloc( @@ -353,24 +391,28 @@ namespace libmpdataxx mem->psi[e].push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); // Courant field components (Arakawa-C grid) mem->GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_vctr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), 
+ arr3D_storage ))); mem->GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_vctr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); mem->GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_vctr(mem->grid_size[2]) + parent_t::rng_vctr(mem->grid_size[2]), + arr3D_storage ))); // fully third-order accurate mpdata needs also time derivatives of @@ -382,33 +424,39 @@ namespace libmpdataxx mem->ndt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_vctr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); mem->ndt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_vctr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); mem->ndt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_vctr(mem->grid_size[2]) + parent_t::rng_vctr(mem->grid_size[2]), + arr3D_storage ))); mem->ndtt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_vctr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); mem->ndtt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_vctr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); mem->ndtt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_vctr(mem->grid_size[2]) + parent_t::rng_vctr(mem->grid_size[2]), + arr3D_storage ))); } @@ -417,7 +465,8 @@ namespace libmpdataxx mem->G.reset(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); // allocate Kahan summation temporary vars @@ -426,7 +475,8 @@ namespace libmpdataxx mem->khn_tmp.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); // courant field alloc_tmp_sclr(mem, __FILE__, 1); @@ -449,7 +499,8 @@ namespace libmpdataxx stgr[n][1] ? parent_t::rng_vctr(mem->grid_size[1]) : parent_t::rng_sclr(mem->grid_size[1]), srfc ? rng_t(0, 0) : stgr[n][2] ? parent_t::rng_vctr(mem->grid_size[2]) : - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); } } @@ -479,7 +530,8 @@ namespace libmpdataxx mem->tmp[__file__].back().push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - srfc ? rng_t(0, 0) : parent_t::rng_sclr(mem->grid_size[2]) + srfc ? 
rng_t(0, 0) : parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); } }; diff --git a/libmpdata++/solvers/detail/solver_common.hpp b/libmpdata++/solvers/detail/solver_common.hpp index c70cb54e..5aab5f07 100644 --- a/libmpdata++/solvers/detail/solver_common.hpp +++ b/libmpdata++/solvers/detail/solver_common.hpp @@ -133,7 +133,7 @@ namespace libmpdataxx scale(e, -ct_params_t::hint_scale(e)); } - // thread-aware range extension + // thread-aware range extension; note: it interferes with the range guessing done by the remote_3d bcond template <class n_t> rng_t extend_range(const rng_t &r, const n_t n) const { @@ -255,7 +255,7 @@ namespace libmpdataxx // TODO: MPI standard requires that the same thread that called mpi_init // calls mpi_finalize, we don't ensure it if(!libmpdataxx::concurr::detail::mpi_initialized_before && rank==0) - MPI::Finalize(); + MPI_Finalize(); #endif #if !defined(NDEBUG) assert(hook_ante_step_called && "any overriding hook_ante_step() must call parent_t::hook_ante_step()");
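An aside, not part of the patch: the switch from MPI::Init_thread/MPI::Finalize to the C API reflects the removal of the MPI C++ bindings in MPI-3. A minimal, self-contained sketch of the initialization pattern the patch adopts (guarding against double initialization and requesting MPI_THREAD_MULTIPLE); compile with an MPI wrapper such as mpicxx:

#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv)
{
  int initialized = 0;
  MPI_Initialized(&initialized);   // avoid double initialization, as in distmem.hpp
  if (!initialized)
  {
    int provided = 0;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    if (provided < MPI_THREAD_MULTIPLE)
      std::fprintf(stderr, "warning: MPI_THREAD_MULTIPLE not provided\n");
  }

  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  std::printf("rank %d running with thread support\n", rank);

  MPI_Finalize();
  return 0;
}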