diff --git a/libmpdata++/bcond/detail/bcond_common.hpp b/libmpdata++/bcond/detail/bcond_common.hpp index 1e9c6df2..92cf4445 100644 --- a/libmpdata++/bcond/detail/bcond_common.hpp +++ b/libmpdata++/bcond/detail/bcond_common.hpp @@ -193,6 +193,8 @@ namespace libmpdataxx virtual void avg_edge_and_halo1_sclr_cyclic(arr_3d_t &, const rng_t &, const rng_t &) {}; + const bool single_threaded; + protected: // sclr int @@ -207,7 +209,7 @@ namespace libmpdataxx public: // ctor - bcond_common(const rng_t &i, const std::array &) : + bcond_common(const rng_t &i, const std::array &, bool single_threaded = false) : // sclr left_edge_sclr( i.first() @@ -247,7 +249,8 @@ namespace libmpdataxx rght_intr_vctr( (i^h^(-1)).last() - (halo - 1), (i^h^(-1)).last() - ) + ), + single_threaded(single_threaded) {} // the one for use in shared diff --git a/libmpdata++/bcond/detail/polar_common.hpp b/libmpdata++/bcond/detail/polar_common.hpp index ebdb8326..193cf72a 100644 --- a/libmpdata++/bcond/detail/polar_common.hpp +++ b/libmpdata++/bcond/detail/polar_common.hpp @@ -37,10 +37,10 @@ namespace libmpdataxx // ctor polar_common( const rng_t &i, - const std::array &grid_size + const std::array &distmem_grid_size ) : - parent_t(i, grid_size), - pole((grid_size[0] - 1) / 2) + parent_t(i, distmem_grid_size), + pole((distmem_grid_size[0] - 1) / 2) {} }; } // namespace detail diff --git a/libmpdata++/bcond/detail/remote_3d_common.hpp b/libmpdata++/bcond/detail/remote_3d_common.hpp new file mode 100644 index 00000000..6aaf0aa3 --- /dev/null +++ b/libmpdata++/bcond/detail/remote_3d_common.hpp @@ -0,0 +1,116 @@ +// common code for ``remote'' MPI boundary conditions for libmpdata++ +// +// licensing: GPU GPL v3 +// copyright: University of Warsaw + +#pragma once + +#include + +namespace libmpdataxx +{ + namespace bcond + { + namespace detail + { + template + class remote_3d_common : public remote_common + { + using parent_t = detail::remote_common; + + protected: + + using arr_t = typename parent_t::arr_t; + using idx_t = typename parent_t::idx_t; + + const int thread_rank, thread_size; + + private: + + const rng_t thread_j; + const int grid_size_y; + + // try to guess what should be the whole domain exchanged by this process + // based on the difference between idx to be sent by this thread and idx of this process + idx_t extend_idx(idx_t idx) + { +//std::cerr << "extend idx start idx(1): " << idx.lbound(1) << ", " << idx.ubound(1) << std::endl; + idx.lbound(1) = 0 + idx.lbound(1) - thread_j.first(); // does it have to start at 0? + idx.ubound(1) = (grid_size_y - 1) + idx.ubound(1) - thread_j.last(); // does it have to end at grid_size_y - 1? 
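An aside, not part of the patch: the extend_idx() logic above stretches a thread-local j-range to the full process domain so that a single thread can post the MPI message for the whole slab. A minimal, self-contained sketch of that arithmetic follows; Range and Box are simplified stand-ins for rng_t and the blitz RectDomain idx_t, and the numbers are made up.

#include <cassert>

// Range and Box are simplified stand-ins for rng_t and idx_t
struct Range { int first, last; };
struct Box   { Range dim[3]; };  // dim[1] is the y (j) extent

// stretch the j extent of a thread-local box to the whole process domain,
// keeping the same offsets relative to the thread's own j-range
Box extend_idx(Box idx, const Range &thread_j, int grid_size_y)
{
  idx.dim[1].first = 0                 + idx.dim[1].first - thread_j.first;
  idx.dim[1].last  = (grid_size_y - 1) + idx.dim[1].last  - thread_j.last;
  return idx;
}

int main()
{
  // a thread owning j in [8,15] of a 32-point y domain wants to send j in [8,15]
  Box send{{{0, 2}, {8, 15}, {0, 63}}};
  Box full = extend_idx(send, Range{8, 15}, 32);
  assert(full.dim[1].first == 0 && full.dim[1].last == 31);  // whole process domain in j
  return 0;
}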
+//std::cerr << "extend idx end idx(1): " << idx.lbound(1) << ", " << idx.ubound(1) << std::endl; + return idx; + } + + public: + + void xchng ( + const arr_t &a, + const idx_t &idx_send, + const idx_t &idx_recv + ) + { + parent_t::xchng(a, extend_idx(idx_send), extend_idx(idx_recv)); + } + + void send ( + const arr_t &a, + const idx_t &idx_send + ) + { + parent_t::send(a, extend_idx(idx_send)); + } + + void recv ( + const arr_t &a, + const idx_t &idx_recv + ) + { + parent_t::recv(a, extend_idx(idx_recv)); + } + + // ctor + remote_3d_common( + const rng_t &i, + const std::array &distmem_grid_size, + const rng_t _thread_j, + const int thread_rank, + const int thread_size + ) : + parent_t(i, distmem_grid_size, true), // true indicating that this is a bcond done with a single thread + thread_rank(thread_rank), + thread_size(thread_size), + thread_j(_thread_j), + grid_size_y(distmem_grid_size[1]) + { +#if defined(USE_MPI) + // only 2 threads do mpi, others don't need buffers + if(thread_rank != 0 && thread_rank != thread_size-1) + { + free(parent_t::buf_send); + free(parent_t::buf_recv); + } +//std::cerr << "remote_3d_common ctor thread_j: " << thread_j.lbound(0) << ", " << thread_j.ubound(0) << std::endl; +//std::cerr << "remote_3d_common ctor _thread_j: " << _thread_j.lbound(0) << ", " << _thread_j.ubound(0) << std::endl; +//std::cerr << "remote_3d_common ctor f-l thread_j: " << thread_j.first() << ", " << thread_j.last() << std::endl; +//std::cerr << "remote_3d_common ctor f-l _thread_j: " << _thread_j.first() << ", " << _thread_j.last() << std::endl; +#endif + } + + // dtor + ~remote_3d_common() + { +#if defined(USE_MPI) + if(thread_rank == 0 || thread_rank == thread_size-1) + { + free(parent_t::buf_send); + free(parent_t::buf_recv); + } + // hack to make free in ~remote_common give defined behaviour + parent_t::buf_send = nullptr; + parent_t::buf_recv = nullptr; +#endif + } + }; + }; + } // namespace bcond +} // namespace libmpdataxx diff --git a/libmpdata++/bcond/detail/remote_common.hpp b/libmpdata++/bcond/detail/remote_common.hpp index 2d2cfba7..034b33a5 100644 --- a/libmpdata++/bcond/detail/remote_common.hpp +++ b/libmpdata++/bcond/detail/remote_common.hpp @@ -29,21 +29,22 @@ namespace libmpdataxx using arr_t = blitz::Array; using idx_t = blitz::RectDomain; - private: + real_t *buf_send, + *buf_recv; - const int grid_size_0; + private: #if defined(USE_MPI) boost::mpi::communicator mpicom; - real_t *buf_send, - *buf_recv; # if defined(NDEBUG) static const int n_reqs = 2; // data, reqs for recv only is enough? - static const int n_dbg_reqs = 0; + static const int n_dbg_send_reqs = 0; + static const int n_dbg_tags = 0; # else static const int n_reqs = 4; // data + ranges - static const int n_dbg_reqs = 1; + static const int n_dbg_send_reqs = 1; + static const int n_dbg_tags = 2; # endif std::array reqs; @@ -53,7 +54,6 @@ namespace libmpdataxx : (mpicom.rank() + 1 ) % mpicom.size(); # if !defined(NDEBUG) - const int debug = 2; std::pair buf_rng; # endif #endif @@ -78,6 +78,12 @@ namespace libmpdataxx const int msg_send = dir == left ? 
left : rght; +// std::cerr << "send_hlpr idx dir " << dir << " : " +// << " (" << idx_send.lbound(0) << ", " << idx_send.ubound(0) << ")" +// << " (" << idx_send.lbound(1) << ", " << idx_send.ubound(1) << ")" +// << " (" << idx_send.lbound(2) << ", " << idx_send.ubound(2) << ")" +// << std::endl; + // arr_send references part of the send buffer that will be used arr_t arr_send(buf_send, a(idx_send).shape(), blitz::neverDeleteData); // copying data to be sent @@ -92,7 +98,7 @@ namespace libmpdataxx // sending debug information # if !defined(NDEBUG) - reqs[1] = mpicom.isend(peer, msg_send ^ debug, std::pair( + reqs[1] = mpicom.isend(peer, msg_send + n_dbg_tags, std::pair( idx_send[0].first(), idx_send[0].last() )); @@ -112,15 +118,21 @@ namespace libmpdataxx const int msg_recv = dir == left ? rght : left; +// std::cerr << "recv_hlpr idx dir " << dir << " : " +// << " (" << idx_recv.lbound(0) << ", " << idx_recv.ubound(0) << ")" +// << " (" << idx_recv.lbound(1) << ", " << idx_recv.ubound(1) << ")" +// << " (" << idx_recv.lbound(2) << ", " << idx_recv.ubound(2) << ")" +// << std::endl; + // launching async data transfer if(a(idx_recv).size()!=0) // TODO: test directly size of idx_recv { - reqs[1+n_dbg_reqs] = mpicom.irecv(peer, msg_recv, buf_recv, a(idx_recv).size()); + reqs[1+n_dbg_send_reqs] = mpicom.irecv(peer, msg_recv, buf_recv, a(idx_recv).size()); // sending debug information # if !defined(NDEBUG) - reqs[3] = mpicom.irecv(peer, msg_recv ^ debug, buf_rng); + reqs[3] = mpicom.irecv(peer, msg_recv + n_dbg_tags, buf_rng); # endif } #else @@ -137,7 +149,7 @@ namespace libmpdataxx send_hlpr(a, idx_send); // waiting for the transfers to finish - boost::mpi::wait_all(reqs.begin(), reqs.begin() + 1 + n_dbg_reqs); // MPI_Waitall is thread-safe? + boost::mpi::wait_all(reqs.begin(), reqs.begin() + 1 + n_dbg_send_reqs); // MPI_Waitall is thread-safe? #else assert(false); #endif @@ -153,7 +165,7 @@ namespace libmpdataxx recv_hlpr(a, idx_recv); // waiting for the transfers to finish - boost::mpi::wait_all(reqs.begin() + 1 + n_dbg_reqs, reqs.end()); // MPI_Waitall is thread-safe? + boost::mpi::wait_all(reqs.begin() + 1 + n_dbg_send_reqs, reqs.end()); // MPI_Waitall is thread-safe? // a blitz handler for the used part of the receive buffer arr_t arr_recv(buf_recv, a(idx_recv).shape(), blitz::neverDeleteData); // TODO: shape directly from idx_recv @@ -207,13 +219,21 @@ namespace libmpdataxx // ctor remote_common( const rng_t &i, - const std::array &grid_size + const std::array &distmem_grid_size, + bool single_threaded = false ) : - parent_t(i, grid_size), - grid_size_0(grid_size[0]) + parent_t(i, distmem_grid_size, single_threaded) { #if defined(USE_MPI) - const int slice_size = n_dims==1 ? 1 : (n_dims==2? grid_size[1]+6 : (grid_size[1]+6) * (grid_size[2]+6) ); // 3 is the max halo size (?), so 6 on both sides + + const int slice_size = n_dims==1 ? 1 : (n_dims==2? 
distmem_grid_size[1]+6 : (distmem_grid_size[1]+6) * (distmem_grid_size[2]+6) ); // 3 is the max halo size (?), so 6 on both sides +//std::cerr << "remote_common ctor, " +// << " distmem_grid_size[0]: " << distmem_grid_size[0] +// << " distmem_grid_size[1]: " << distmem_grid_size[1] +// << " distmem_grid_size[2]: " << distmem_grid_size[2] +// << " slice_size: " << slice_size +// << " halo: " << halo +// << std::endl; // allocate enough memory in buffers to store largest halos to be sent buf_send = (real_t *) malloc(halo * slice_size * sizeof(real_t)); buf_recv = (real_t *) malloc(halo * slice_size * sizeof(real_t)); @@ -232,3 +252,4 @@ namespace libmpdataxx } } // namespace bcond } // namespace libmpdataxx + diff --git a/libmpdata++/bcond/open_3d.hpp b/libmpdata++/bcond/open_3d.hpp index b621748a..da499cfb 100644 --- a/libmpdata++/bcond/open_3d.hpp +++ b/libmpdata++/bcond/open_3d.hpp @@ -78,15 +78,15 @@ namespace libmpdataxx // TODO: exactly the same code below! switch (d) // note: order and lack of breaks intentional! { - case 0: + case 1: av[d+2](pi(i, j, (k-h).first())) = 0; av[d+2](pi(i, j, (k+h).last() )) = 0; - case 1: + case 2: av[d+1](pi(i, (j-h).first(), k)) = 0; av[d+1](pi(i, (j+h).last(), k)) = 0; - case 2: + case 0: break; default: assert(false); @@ -190,15 +190,15 @@ namespace libmpdataxx switch (d) // note: order and lack of breaks intentional! { - case 0: + case 1: av[d+2](pi(i, j, (k-h).first())) = 0; av[d+2](pi(i, j, (k+h).last() )) = 0; - case 1: + case 2: av[d+1](pi(i, (j-h).first(), k)) = 0; av[d+1](pi(i, (j+h).last(), k)) = 0; - case 2: + case 0: break; default: assert(false); diff --git a/libmpdata++/bcond/remote_3d.hpp b/libmpdata++/bcond/remote_3d.hpp index 2ec442b1..acceb4bd 100644 --- a/libmpdata++/bcond/remote_3d.hpp +++ b/libmpdata++/bcond/remote_3d.hpp @@ -5,7 +5,7 @@ #pragma once -#include +#include namespace libmpdataxx { @@ -18,21 +18,50 @@ namespace libmpdataxx dir == left && n_dims == 3 >::type - > : public detail::remote_common + > : public detail::remote_3d_common { - using parent_t = detail::remote_common; - using arr_t = blitz::Array; + using parent_t = detail::remote_3d_common; + using arr_t = typename parent_t::arr_t; + using idx_t = typename parent_t::idx_t; using parent_t::parent_t; // inheriting ctor const int off = this->is_cyclic ? 0 : -1; + void xchng ( + const arr_t &a, + const idx_t &idx_send, + const idx_t &idx_recv + ) + { + if(this->thread_rank!=0) return; // left MPI calls only done by thread rank 0 + parent_t::xchng(a, idx_send, idx_recv); + } + + void send ( + const arr_t &a, + const idx_t &idx_send + ) + { + if(this->thread_rank!=0) return; // left MPI calls only done by thread rank 0 + parent_t::send(a, idx_send); + } + + void recv ( + const arr_t &a, + const idx_t &idx_recv + ) + { + if(this->thread_rank!=0) return; // left MPI calls only done by thread rank 0 + parent_t::recv(a, idx_recv); + } + public: void fill_halos_sclr(arr_t &a, const rng_t &j, const rng_t &k, const bool deriv = false) { using namespace idxperm; - this->xchng(a, pi(this->left_intr_sclr + off, j, k), pi(this->left_halo_sclr, j, k)); + xchng(a, pi(this->left_intr_sclr + off, j, k), pi(this->left_halo_sclr, j, k)); } void fill_halos_pres(arr_t &a, const rng_t &j, const rng_t &k) @@ -52,12 +81,12 @@ namespace libmpdataxx { if(halo == 1) // see remote_2d - this->send(av[0], pi(this->left_intr_vctr + off, j, k)); // TODO: no need to receive? the vector in halo was calculated anyway? + send(av[0], pi(this->left_intr_vctr + off, j, k)); // TODO: no need to receive? 
the vector in halo was calculated anyway? else - this->xchng(av[0], pi(this->left_intr_vctr + off, j, k), pi((this->left_halo_vctr^h)^(-1), j, k)); // ditto + xchng(av[0], pi(this->left_intr_vctr + off, j, k), pi((this->left_halo_vctr^h)^(-1), j, k)); // ditto } else - this->xchng(av[0], pi(this->left_intr_vctr + off, j, k), pi(this->left_halo_vctr, j, k)); + xchng(av[0], pi(this->left_intr_vctr + off, j, k), pi(this->left_halo_vctr, j, k)); } void fill_halos_sgs_div(arr_t &a, const rng_t &j, const rng_t &k) @@ -73,12 +102,12 @@ namespace libmpdataxx { if(halo == 1) // see remote_2d - this->send(av[0 + offset], pi(this->left_intr_vctr + off, j, k)); + send(av[0 + offset], pi(this->left_intr_vctr + off, j, k)); else - this->xchng(av[0 + offset], pi(this->left_intr_vctr + off, j, k), pi((this->left_halo_vctr^h)^(-1), j, k)); + xchng(av[0 + offset], pi(this->left_intr_vctr + off, j, k), pi((this->left_halo_vctr^h)^(-1), j, k)); } else - this->xchng(av[0 + offset], pi(this->left_intr_vctr + off, j, k), pi(this->left_halo_vctr, j, k)); + xchng(av[0 + offset], pi(this->left_intr_vctr + off, j, k), pi(this->left_halo_vctr, j, k)); } void fill_halos_sgs_tnsr(arrvec_t &av, const arr_t &, const arr_t &, const rng_t &j, const rng_t &k, const real_t) @@ -109,7 +138,7 @@ namespace libmpdataxx using namespace idxperm; assert(halo>=1); - this->xchng(a, pi(this->left_edge_sclr, j, k), pi(this->left_halo_sclr.last(), j, k)); + xchng(a, pi(this->left_edge_sclr, j, k), pi(this->left_halo_sclr.last(), j, k)); } void avg_edge_and_halo1_sclr_cyclic(arr_t &a, const rng_t &j, const rng_t &k) @@ -131,20 +160,52 @@ namespace libmpdataxx dir == rght && n_dims == 3 >::type - > : public detail::remote_common + > : public detail::remote_3d_common { - using parent_t = detail::remote_common; - using arr_t = blitz::Array; + using parent_t = detail::remote_3d_common; + using arr_t = typename parent_t::arr_t; + using idx_t = typename parent_t::idx_t; using parent_t::parent_t; // inheriting ctor const int off = this->is_cyclic ? 
0 : 1; + void xchng ( + const arr_t &a, + const idx_t &idx_send, + const idx_t &idx_recv + ) + { + if(this->thread_rank != this->thread_size-1) return; // right MPI calls only done by the highest ranked thread + // if(this->thread_rank != 0) return; // temporary + parent_t::xchng(a, idx_send, idx_recv); + } + + void send ( + const arr_t &a, + const idx_t &idx_send + ) + { + if(this->thread_rank != this->thread_size-1) return; // right MPI calls only done by the highest ranked thread + // if(this->thread_rank != 0) return; // temporary + parent_t::send(a, idx_send); + } + + void recv ( + const arr_t &a, + const idx_t &idx_recv + ) + { + if(this->thread_rank != this->thread_size-1) return; // right MPI calls only done by the highest ranked thread + // if(this->thread_rank != 0) return; // temporary + parent_t::recv(a, idx_recv); + } + public: void fill_halos_sclr(arr_t &a, const rng_t &j, const rng_t &k, const bool deriv = false) { using namespace idxperm; - this->xchng(a, pi(this->rght_intr_sclr + off, j, k), pi(this->rght_halo_sclr, j, k)); + xchng(a, pi(this->rght_intr_sclr + off, j, k), pi(this->rght_halo_sclr, j, k)); } void fill_halos_pres(arr_t &a, const rng_t &j, const rng_t &k) @@ -163,12 +224,12 @@ namespace libmpdataxx if(!this->is_cyclic) { if(halo == 1) - this->recv(av[0], pi(this->rght_halo_vctr, j, k)); + recv(av[0], pi(this->rght_halo_vctr, j, k)); else - this->xchng(av[0], pi(((this->rght_intr_vctr + off)^h)^(-1), j, k), pi(this->rght_halo_vctr, j, k)); + xchng(av[0], pi(((this->rght_intr_vctr + off)^h)^(-1), j, k), pi(this->rght_halo_vctr, j, k)); } else - this->xchng(av[0], pi(this->rght_intr_vctr + off, j, k), pi(this->rght_halo_vctr, j, k)); + xchng(av[0], pi(this->rght_intr_vctr + off, j, k), pi(this->rght_halo_vctr, j, k)); } void fill_halos_sgs_div(arr_t &a, const rng_t &j, const rng_t &k) @@ -183,12 +244,12 @@ namespace libmpdataxx if(!this->is_cyclic) { if(halo == 1) - this->recv(av[0 + offset], pi(this->rght_halo_vctr, j, k)); + recv(av[0 + offset], pi(this->rght_halo_vctr, j, k)); else - this->xchng(av[0 + offset], pi(((this->rght_intr_vctr + off)^h)^(-1), j, k), pi(this->rght_halo_vctr, j, k)); + xchng(av[0 + offset], pi(((this->rght_intr_vctr + off)^h)^(-1), j, k), pi(this->rght_halo_vctr, j, k)); } else - this->xchng(av[0 + offset], pi(this->rght_intr_vctr + off, j, k), pi(this->rght_halo_vctr, j, k)); + xchng(av[0 + offset], pi(this->rght_intr_vctr + off, j, k), pi(this->rght_halo_vctr, j, k)); } void fill_halos_sgs_tnsr(arrvec_t &av, const arr_t &, const arr_t &, const rng_t &j, const rng_t &k, const real_t) @@ -219,7 +280,7 @@ namespace libmpdataxx using namespace idxperm; assert(halo>=1); - this->xchng(a, pi(this->rght_edge_sclr, j, k), pi(this->rght_halo_sclr.first(), j, k)); + xchng(a, pi(this->rght_edge_sclr, j, k), pi(this->rght_halo_sclr.first(), j, k)); } void avg_edge_and_halo1_sclr_cyclic(arr_t &a, const rng_t &j, const rng_t &k) diff --git a/libmpdata++/blitz.hpp b/libmpdata++/blitz.hpp index 606b0996..53273bde 100644 --- a/libmpdata++/blitz.hpp +++ b/libmpdata++/blitz.hpp @@ -112,5 +112,7 @@ namespace libmpdataxx #endif } }; -} // namespace libmpdataxx + // 3D blitz++ array storage with the k,i,j order for improved performance of sharedmem decomposition along j + const blitz::GeneralArrayStorage<3> arr3D_storage({blitz::thirdDim, blitz::firstDim, blitz::secondDim}, {true, true, true}); +} // namespace libmpdataxx diff --git a/libmpdata++/concurr/boost_thread.hpp b/libmpdata++/concurr/boost_thread.hpp index 83675a7c..f1d7c7a6 100644 --- 
a/libmpdata++/concurr/boost_thread.hpp +++ b/libmpdata++/concurr/boost_thread.hpp @@ -81,7 +81,7 @@ namespace libmpdataxx // ctor boost_thread(const typename solver_t::rt_params_t &p) : - parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[0])) + parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[solver_t::n_dims < 3 ? 0 : 1])) // note 3D domain decomposition in y direction {} }; diff --git a/libmpdata++/concurr/cxx11_thread.hpp b/libmpdata++/concurr/cxx11_thread.hpp index b89a556e..77dc47d9 100644 --- a/libmpdata++/concurr/cxx11_thread.hpp +++ b/libmpdata++/concurr/cxx11_thread.hpp @@ -119,7 +119,7 @@ namespace libmpdataxx // ctor cxx11_thread(const typename solver_t::rt_params_t &p) : - parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[0])) + parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[solver_t::n_dims < 3 ? 0 : 1])) // note 3D domain decomposition in y direction {} }; diff --git a/libmpdata++/concurr/detail/concurr_common.hpp b/libmpdata++/concurr/detail/concurr_common.hpp index 691e4ff0..9cacedd2 100644 --- a/libmpdata++/concurr/detail/concurr_common.hpp +++ b/libmpdata++/concurr/detail/concurr_common.hpp @@ -40,6 +40,81 @@ namespace libmpdataxx { namespace detail { + template < + class real_t, + bcond::drctn_e dir, + int dim, + int n_dims, + int halo, + class bcp_t, + class mem_t + > + struct bc_set_remote_impl + { + static void _( + bcp_t &bcp, + const std::unique_ptr &mem, + const int thread_rank, + const int thread_size + ) + { + bcp.reset( + new bcond::bcond( + mem->slab(mem->grid_size[dim]), + mem->distmem.grid_size + ) + ); + } + }; + + template < + class real_t, + bcond::drctn_e dir, + int dim, + int halo, + class bcp_t, + class mem_t + > + struct bc_set_remote_impl + { + static void _( + bcp_t &bcp, + const std::unique_ptr &mem, + const int thread_rank, + const int thread_size + ) + { + bcp.reset( + new bcond::bcond( + mem->slab(mem->grid_size[dim]), + mem->distmem.grid_size, + mem->slab(mem->grid_size[1], thread_rank, thread_size), // NOTE: we assume here remote 3d bcond is only on the edges perpendicular to x + thread_rank, + thread_size + ) + ); + } + }; + + template < + class real_t, + bcond::drctn_e dir, + int dim, + int n_dims, + int halo, + class bcp_t, + class mem_t + > + void bc_set_remote( + bcp_t &bcp, + const std::unique_ptr &mem, + const int thread_rank, + const int thread_size + ) + { + bc_set_remote_impl::_(bcp, mem, thread_rank, thread_size); + } + template< class solver_t_, bcond::bcond_e bcxl, bcond::bcond_e bcxr, @@ -115,7 +190,9 @@ namespace libmpdataxx int dim > void bc_set( - typename solver_t::bcp_t &bcp + typename solver_t::bcp_t &bcp, + const int thread_rank = 0, // required only by 3D remote (MPI) bcond + const int thread_size = 0 // required only by 3D remote (MPI) bcond ) { // sanity check - polar coords do not work with MPI yet @@ -123,7 +200,7 @@ namespace libmpdataxx throw std::runtime_error("Polar boundary conditions do not work with MPI."); // distmem overrides - if (type != bcond::remote && mem->distmem.size() > 1 && dim == 0) + if (mem->distmem.size() > 1 && dim == 0) { if ( // distmem domain interior @@ -133,10 +210,19 @@ namespace libmpdataxx // cyclic condition for distmem domain (note: will not work if a non-cyclic condition is on the other end) || (type == bcond::cyclic) - ) return bc_set(bcp); + ) + { + // bc allocation, all mpi routines called by the remote bcnd ctor are thread-safe (?) 
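An aside, not part of the patch: bc_set_remote_impl above dispatches on n_dims through a class template with a static member because function templates cannot be partially specialized. A minimal sketch of that pattern is below; the names make_remote/make_remote_impl are illustrative only.

#include <iostream>

// generic case: thread info is not needed (1D/2D remote bcond in the patch)
template <class real_t, int n_dims>
struct make_remote_impl
{
  static void _(int /*thread_rank*/, int /*thread_size*/)
  {
    std::cout << "generic remote bcond, n_dims = " << n_dims << "\n";
  }
};

// partial specialization: only n_dims is fixed, real_t stays generic;
// the 3D case also receives the thread decomposition
template <class real_t>
struct make_remote_impl<real_t, 3>
{
  static void _(int thread_rank, int thread_size)
  {
    std::cout << "3D remote bcond, thread " << thread_rank << "/" << thread_size << "\n";
  }
};

// thin forwarding function template, analogous to bc_set_remote()
template <class real_t, int n_dims>
void make_remote(int thread_rank, int thread_size)
{
  make_remote_impl<real_t, n_dims>::_(thread_rank, thread_size);
}

int main()
{
  make_remote<double, 2>(0, 4);
  make_remote<double, 3>(3, 4);
  return 0;
}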
+ bc_set_remote( + bcp, + mem, + thread_rank, + thread_size + ); + return; + } } - // bc allocation, all mpi routines called by the remote bcnd ctor are thread-safe (?) bcp.reset( new bcond::bcond( mem->slab(mem->grid_size[dim]), @@ -153,6 +239,7 @@ namespace libmpdataxx { typename solver_t::bcp_t bxl, bxr, shrdl, shrdr; + // NOTE: for remote bcond, thread_rank set to 0 on purpose in 1D to have propre left/right message tags bc_set(bxl); bc_set(bxr); @@ -189,6 +276,7 @@ namespace libmpdataxx { typename solver_t::bcp_t bxl, bxr, byl, byr, shrdl, shrdr; + // NOTE: for remote bcond, thread_rank set to 0 on purpose in 2D to have propre left/right message tags bc_set(bxl); bc_set(bxr); @@ -216,11 +304,11 @@ namespace libmpdataxx } } - // 3D version + // 3D version, note sharedmem in y direction! void init( const typename solver_t::rt_params_t &p, const std::array &grid_size, - const int &n0, const int &n1 = 1, const int &n2 = 1 + const int &n1, const int &n0 = 1, const int &n2 = 1 ) { typename solver_t::bcp_t bxl, bxr, byl, byr, bzl, bzr, shrdl, shrdr; @@ -231,8 +319,9 @@ namespace libmpdataxx { for (int i2 = 0; i2 < n2; ++i2) { - bc_set(bxl); - bc_set(bxr); + // i1 is the local thread rank, n1 is the number of threads. These are needed by remote bcond, because only rank=0 does mpi communication + bc_set(bxl, i1, n1); + bc_set(bxr, i1, n1); bc_set(byl); bc_set(byr); @@ -246,11 +335,11 @@ namespace libmpdataxx algos.push_back( new solver_t( typename solver_t::ctor_args_t({ - i0, + i1, mem.get(), - i0 == 0 ? bxl : shrdl, - i0 == n0 - 1 ? bxr : shrdr, - byl, byr, + bxl, bxr, + i1 == 0 ? byl : shrdl, + i1 == n1 - 1 ? byr : shrdr, bzl, bzr, mem->slab(grid_size[0], i0, n0), mem->slab(grid_size[1], i1, n1), diff --git a/libmpdata++/concurr/detail/distmem.hpp b/libmpdata++/concurr/detail/distmem.hpp index 0465b830..c8d6e0b2 100644 --- a/libmpdata++/concurr/detail/distmem.hpp +++ b/libmpdata++/concurr/detail/distmem.hpp @@ -120,10 +120,13 @@ namespace libmpdataxx // init mpi here, since distmem is constructed before hdf5 // will be finalized in slvr_common dtor, since distmem is destructed before hdf5; // boost::mpi::enviro being global var caused deadlocks when freeing memory - if(!MPI::Is_initialized()) + int mpi_initialized; + MPI_Initialized(&mpi_initialized); + if(!mpi_initialized) { mpi_initialized_before = false; - MPI::Init_thread(MPI_THREAD_MULTIPLE); + int th_lvl_provided; + MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &th_lvl_provided); } if (boost::mpi::environment::thread_level() != boost::mpi::threading::multiple) { diff --git a/libmpdata++/concurr/detail/sharedmem.hpp b/libmpdata++/concurr/detail/sharedmem.hpp index cf927d6a..39ccb7c5 100644 --- a/libmpdata++/concurr/detail/sharedmem.hpp +++ b/libmpdata++/concurr/detail/sharedmem.hpp @@ -29,7 +29,6 @@ namespace libmpdataxx > class sharedmem_common { - using arr_t = blitz::Array; static_assert(n_dims > 0, "n_dims <= 0"); static_assert(n_tlev > 0, "n_tlev <= 0"); @@ -39,6 +38,7 @@ namespace libmpdataxx protected: + using arr_t = blitz::Array; blitz::TinyVector origin; public: @@ -48,6 +48,11 @@ namespace libmpdataxx std::array grid_size; bool panic = false; // for multi-threaded SIGTERM handling + // dimension in which sharedmem domain decomposition is done + // 1D and 2D - domain decomposed in 0-th dimension (x) + // 3D - domain decomposed in 1-st dimension (y) for better workload balance in MPI runs (MPI is decomposed in x) + const int shmem_decomp_dim; + detail::distmem distmem; // TODO: these are public because used from 
outside in alloc - could friendship help? @@ -84,27 +89,25 @@ namespace libmpdataxx // ctors // TODO: fill reducetmp with NaNs (or use 1-element arrvec_t - it's NaN-filled by default) sharedmem_common(const std::array &grid_size, const int &size) - : n(0), distmem(grid_size), size(size) // TODO: is n(0) needed? + : n(0), distmem(grid_size), size(size), shmem_decomp_dim(n_dims < 3 ? 0 : 1) // TODO: is n(0) needed? { for (int d = 0; d < n_dims; ++d) { this->grid_size[d] = slab( rng_t(0, grid_size[d]-1), - d == 0 ? distmem.rank() : 0, + d == 0 ? distmem.rank() : 0, // decomposition along x, because that's MPI decomposition d == 0 ? distmem.size() : 1 + // d == shmem_decomp_dim ? distmem.rank() : 0, + // d == shmem_decomp_dim ? distmem.size() : 1 ); origin[d] = this->grid_size[d].first(); } - std::ostringstream oss; - oss << "grid_size[0]: " << this->grid_size[0] << " origin[0]: " << origin[0] << std::endl; - std::cerr << oss.str() << std::endl; - if (size > grid_size[0]) throw std::runtime_error("number of subdomains greater than number of gridpoints"); if (n_dims != 1) - sumtmp.reset(new blitz::Array(this->grid_size[0])); + sumtmp.reset(new blitz::Array(this->grid_size[shmem_decomp_dim])); xtmtmp.reset(new blitz::Array(size)); } @@ -113,11 +116,11 @@ namespace libmpdataxx { // doing a two-step sum to reduce numerical error // and make parallel results reproducible - for (int c = ijk[0].first(); c <= ijk[0].last(); ++c) // TODO: optimise for i.count() == 1 + for (int c = ijk[shmem_decomp_dim].first(); c <= ijk[shmem_decomp_dim].last(); ++c) // TODO: optimise for i.count() == 1 { auto slice_idx = ijk; - slice_idx.lbound(0) = c; - slice_idx.ubound(0) = c; + slice_idx.lbound(shmem_decomp_dim) = c; + slice_idx.ubound(shmem_decomp_dim) = c; if (sum_khn) (*sumtmp)(c) = blitz::kahan_sum(arr(slice_idx)); @@ -138,14 +141,14 @@ namespace libmpdataxx { // master thread calculates the sum from this process, stores in shared array if (sum_khn) - (*sumtmp)(grid_size[0].first())= blitz::kahan_sum(*sumtmp); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first())= blitz::kahan_sum(*sumtmp); // inplace?! else - (*sumtmp)(grid_size[0].first())= blitz::sum(*sumtmp); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first())= blitz::sum(*sumtmp); // inplace?! // master thread calculates sum of sums from all processes - (*sumtmp)(grid_size[0].first()) = this->distmem.sum((*sumtmp)(grid_size[0].first())); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first()) = this->distmem.sum((*sumtmp)(grid_size[shmem_decomp_dim].first())); // inplace?! 
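An aside, not part of the patch: the reduction above is deliberately done in two steps — each slice along the decomposition dimension is first reduced into its own slot of sumtmp, and only then are the slots reduced (and handed to distmem.sum() across MPI ranks) — so the result does not depend on how slices were assigned to threads. A minimal sketch, with plain vectors standing in for the blitz arrays:

#include <cassert>
#include <vector>

// step 1: reduce every slice into its own slot (in the real code each thread
//         handles the slices it owns and writes into the shared sumtmp array);
// step 2: the master reduces the slots (and would then call distmem.sum()
//         to add up the per-process results across MPI ranks)
double two_step_sum(const std::vector<std::vector<double>> &slices)
{
  std::vector<double> per_slice(slices.size(), 0.0);
  for (std::size_t c = 0; c < slices.size(); ++c)
    for (double v : slices[c])
      per_slice[c] += v;

  double total = 0.0;
  for (double s : per_slice)
    total += s;
  return total;
}

int main()
{
  const std::vector<std::vector<double>> slices = {{1, 2}, {3, 4}, {5, 6}};
  assert(two_step_sum(slices) == 21.0);
  return 0;
}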
} barrier(); - double res = (*sumtmp)(grid_size[0].first()); // propagate the total sum to all threads of the process + double res = (*sumtmp)(grid_size[shmem_decomp_dim].first()); // propagate the total sum to all threads of the process barrier(); // to avoid sumtmp being overwritten by next call to sum from other thread return res; #endif @@ -156,11 +159,11 @@ namespace libmpdataxx { // doing a two-step sum to reduce numerical error // and make parallel results reproducible - for (int c = ijk[0].first(); c <= ijk[0].last(); ++c) + for (int c = ijk[shmem_decomp_dim].first(); c <= ijk[shmem_decomp_dim].last(); ++c) { auto slice_idx = ijk; - slice_idx.lbound(0) = c; - slice_idx.ubound(0) = c; + slice_idx.lbound(shmem_decomp_dim) = c; + slice_idx.ubound(shmem_decomp_dim) = c; if (sum_khn) (*sumtmp)(c) = blitz::kahan_sum(arr1(slice_idx) * arr2(slice_idx)); @@ -182,14 +185,14 @@ namespace libmpdataxx { // master thread calculates the sum from this process, stores in shared array if (sum_khn) - (*sumtmp)(grid_size[0].first())= blitz::kahan_sum(*sumtmp); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first())= blitz::kahan_sum(*sumtmp); // inplace?! else - (*sumtmp)(grid_size[0].first())= blitz::sum(*sumtmp); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first())= blitz::sum(*sumtmp); // inplace?! // master thread calculates sum of sums from all processes - (*sumtmp)(grid_size[0].first()) = this->distmem.sum((*sumtmp)(grid_size[0].first())); // inplace?! + (*sumtmp)(grid_size[shmem_decomp_dim].first()) = this->distmem.sum((*sumtmp)(grid_size[shmem_decomp_dim].first())); // inplace?! } barrier(); - double res = (*sumtmp)(grid_size[0].first()); // propagate the total sum to all threads of the process + double res = (*sumtmp)(grid_size[shmem_decomp_dim].first()); // propagate the total sum to all threads of the process barrier(); // to avoid sumtmp being overwritten by next call to sum from other thread return res; #endif @@ -267,7 +270,7 @@ namespace libmpdataxx boost::ptr_vector tobefreed; public: - arr_t *never_delete(arr_t *arg) + virtual arr_t *never_delete(arr_t *arg) { arr_t *ret = new arr_t(arg->dataFirst(), arg->shape(), blitz::neverDeleteData); ret->reindexSelf(arg->base()); @@ -277,7 +280,7 @@ namespace libmpdataxx arr_t *old(arr_t *arg) { tobefreed.push_back(arg); - arr_t *ret = never_delete(arg); + arr_t *ret = this->never_delete(arg); return ret; } @@ -575,10 +578,18 @@ namespace libmpdataxx class sharedmem : public sharedmem_common { using parent_t = sharedmem_common; + using arr_t = typename parent_t::arr_t; using parent_t::parent_t; // inheriting ctors public: + virtual arr_t *never_delete(arr_t *arg) override + { + arr_t *ret = new arr_t(arg->dataFirst(), arg->shape(), blitz::neverDeleteData, blitz::GeneralArrayStorage<3>(arg->ordering(), {true, true, true})); + ret->reindexSelf(arg->base()); + return ret; + } + blitz::Array advectee(int e = 0) { assert(this->n < n_tlev); diff --git a/libmpdata++/concurr/openmp.hpp b/libmpdata++/concurr/openmp.hpp index 24f7c78b..d33cda98 100644 --- a/libmpdata++/concurr/openmp.hpp +++ b/libmpdata++/concurr/openmp.hpp @@ -77,7 +77,7 @@ namespace libmpdataxx // ctor openmp(const typename solver_t::rt_params_t &p) : - parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[0])) + parent_t(p, new mem_t(p.grid_size), mem_t::size(p.grid_size[solver_t::n_dims < 3 ? 
0 : 1])) // note 3D domain decomposition in y direction {} }; diff --git a/libmpdata++/output/hdf5.hpp b/libmpdata++/output/hdf5.hpp index 553cedef..89c8013c 100644 --- a/libmpdata++/output/hdf5.hpp +++ b/libmpdata++/output/hdf5.hpp @@ -273,8 +273,10 @@ namespace libmpdataxx } case 3: { - typename solver_t::arr_t contiguous_arr(this->mem->grid_size[0], this->mem->grid_size[1], zro); - contiguous_arr = arr(this->mem->grid_size[0], this->mem->grid_size[1], zro); // create a copy that is contiguous + // create a copy that is contiguous and has the C-style (kji) storage order as required by HDF5 + typename solver_t::arr_t contiguous_arr(this->mem->grid_size[0], this->mem->grid_size[1], zro); + contiguous_arr = arr(this->mem->grid_size[0], this->mem->grid_size[1], zro); + dset.write(contiguous_arr.data(), flttype_solver, H5::DataSpace(parent_t::n_dims, srfcshape.data()), space, dxpl_id); break; } @@ -304,7 +306,10 @@ namespace libmpdataxx } case 3: { - typename solver_t::arr_t contiguous_arr = arr(this->mem->grid_size[0], this->mem->grid_size[1], this->mem->grid_size[2]).copy(); // create a copy that is contiguous + // create a copy that is contiguous and has the C-style (kji) storage order as required by HDF5 + typename solver_t::arr_t contiguous_arr(this->mem->grid_size[0], this->mem->grid_size[1], this->mem->grid_size[2]); + contiguous_arr = arr(this->mem->grid_size[0], this->mem->grid_size[1], this->mem->grid_size[2]); + dset.write(contiguous_arr.data(), flttype_solver, H5::DataSpace(parent_t::n_dims, shape.data()), space, dxpl_id); break; } @@ -312,7 +317,7 @@ namespace libmpdataxx }; } - // data is assumed to be contiguous and in the same layout as hdf variable + // data is assumed to be contiguous and in the same layout as hdf variable and in the C-style storage order void record_aux_hlpr(const std::string &name, typename solver_t::real_t *data, H5::H5File hdf) { assert(this->rank == 0); @@ -329,6 +334,7 @@ namespace libmpdataxx aux.write(data, flttype_solver, H5::DataSpace(parent_t::n_dims, shape.data()), space, dxpl_id); } + // data is assumed to be contiguous and in the same layout as hdf variable and in the C-style storage order void record_aux(const std::string &name, typename solver_t::real_t *data) { record_aux_hlpr(name, data, *hdfp); diff --git a/libmpdata++/solvers/detail/mpdata_osc_2d.hpp b/libmpdata++/solvers/detail/mpdata_osc_2d.hpp index f86e9927..70b53d60 100644 --- a/libmpdata++/solvers/detail/mpdata_osc_2d.hpp +++ b/libmpdata++/solvers/detail/mpdata_osc_2d.hpp @@ -230,7 +230,7 @@ namespace libmpdataxx const typename parent_t::rt_params_t &p ) : parent_t(args, p), - im(args.i.first() - 1, args.i.last()), + im(this->rank == 0 ? args.i.first() - 1 : args.i.first(), args.i.last()), jm(args.j.first() - 1, args.j.last()) { } }; diff --git a/libmpdata++/solvers/detail/mpdata_osc_3d.hpp b/libmpdata++/solvers/detail/mpdata_osc_3d.hpp index 8c7901e8..a2eab480 100644 --- a/libmpdata++/solvers/detail/mpdata_osc_3d.hpp +++ b/libmpdata++/solvers/detail/mpdata_osc_3d.hpp @@ -254,7 +254,7 @@ namespace libmpdataxx ) : parent_t(args, p), im(args.i.first() - 1, args.i.last()), - jm(args.j.first() - 1, args.j.last()), + jm(this->rank == 0 ? 
args.j.first() - 1 : args.j.first(), args.j.last()), km(args.k.first() - 1, args.k.last()) { } }; diff --git a/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp b/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp index be1161df..07d15f50 100644 --- a/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp +++ b/libmpdata++/solvers/detail/mpdata_rhs_vip_prs_sgs_common.hpp @@ -57,7 +57,7 @@ namespace libmpdataxx // ijkm with non-overlapping ranges idx_t ijkm_sep; - // like ijk, but with range in x direction extended by 1 to the left for rank=0 for MPI compliance. + // like ijk, but with range in x direction extended by 1 to the left for MPI compliance. // MPI requires that vector between two process domains is calculated by the process to the right of it (c.f. remote_2d.hpp fill_halos_vctr_alng) // TODO: change MPI logic to assume that it is calculated by the process to the left? then, ijk_vec would not be needed(?) std::array ijk_vec; @@ -202,14 +202,26 @@ namespace libmpdataxx ijk_vec[d] = rng_t(this->ijk[d].first(), this->ijk[d].last()); } - if (this->rank == 0) + + if(ct_params_t::n_dims < 3 && this->rank == 0 // 1D and 2D - sharedmem in x direction + || + ct_params_t::n_dims == 3 // 3D - sharedmem in y direction + ) ijk_vec[0] = rng_t(this->ijk[0].first() - 1, this->ijk[0].last()); ijkm_sep = ijkm; if (this->rank > 0) { - ijkm_sep.lbound()(0) = this->ijk[0].first(); - ijkm_sep.ubound()(0) = this->ijk[0].last(); + if(ct_params_t::n_dims < 3) + { + ijkm_sep.lbound()(0) = this->ijk[0].first(); + ijkm_sep.ubound()(0) = this->ijk[0].last(); + } + else + { + ijkm_sep.lbound()(1) = this->ijk[1].first(); + ijkm_sep.ubound()(1) = this->ijk[1].last(); + } } } diff --git a/libmpdata++/solvers/detail/solver_2d.hpp b/libmpdata++/solvers/detail/solver_2d.hpp index 2a95b563..c556e5cc 100644 --- a/libmpdata++/solvers/detail/solver_2d.hpp +++ b/libmpdata++/solvers/detail/solver_2d.hpp @@ -170,6 +170,7 @@ namespace libmpdataxx const int &sign ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], sign); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[0], sign); this->mem->barrier(); @@ -180,6 +181,7 @@ namespace libmpdataxx const idx_t<2> &range_ijk ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1]); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[0]); this->mem->barrier(); diff --git a/libmpdata++/solvers/detail/solver_3d.hpp b/libmpdata++/solvers/detail/solver_3d.hpp index b7a3825a..e2b0a3be 100644 --- a/libmpdata++/solvers/detail/solver_3d.hpp +++ b/libmpdata++/solvers/detail/solver_3d.hpp @@ -26,6 +26,11 @@ namespace libmpdataxx { using parent_t = solver_common; + void barrier_if_single_threaded_bc0() + { + if(this->bcs[0].at(0)->single_threaded || this->bcs[0].at(1)->single_threaded) this->mem->barrier(); + } + public: using real_t = typename ct_params_t::real_t; @@ -43,11 +48,12 @@ namespace libmpdataxx const bool deriv = false ) final // for a given array { - const auto range_ijk_0__ext = this->extend_range(range_ijk[0], ext); + const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv); - for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk_0__ext, deriv); - for (auto &bc : this->bcs[2]) bc->fill_halos_sclr(arr, range_ijk_0__ext, range_ijk[1]^ext, deriv); + for (auto &bc : 
this->bcs[0]) bc->single_threaded ? bc->fill_halos_sclr(arr, range_ijk[1]^ext, range_ijk[2]^ext, deriv) : bc->fill_halos_sclr(arr, range_ijk_1__ext, range_ijk[2]^ext, deriv); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[1]) bc->fill_halos_sclr(arr, range_ijk[2]^ext, range_ijk[0]^ext, deriv); + for (auto &bc : this->bcs[2]) bc->fill_halos_sclr(arr, range_ijk[0]^ext, range_ijk_1__ext, deriv); this->mem->barrier(); } void xchng(int e) final @@ -60,25 +66,31 @@ namespace libmpdataxx this->mem->barrier(); if (!cyclic) { - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); + // order matters! see open_3d bcond for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng(arrvec, i, j, ad); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng(arrvec, j, k, ad); } else { for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_alng_cyclic(arrvec, j, k, ad); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_alng_cyclic(arrvec, k, i, ad); for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_alng_cyclic(arrvec, i, j, ad); } this->mem->barrier(); + } virtual void xchng_flux(arrvec_t &arrvec) final { this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->fill_halos_flux(arrvec, j, k); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->fill_halos_flux(arrvec, k, i); for (auto &bc : this->bcs[2]) bc->fill_halos_flux(arrvec, i, j); + this->mem->barrier(); } virtual void xchng_sgs_div( @@ -89,6 +101,7 @@ namespace libmpdataxx this->mem->barrier(); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_div(arr, range_ijk[0], range_ijk[1]); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_div(arr, range_ijk[2]^h, range_ijk[0]); + barrier_if_single_threaded_bc0(); // to make sure that all threads have filled range_ijk[2]^h before it is used to fill along x for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_div(arr, range_ijk[1], range_ijk[2]^h); this->mem->barrier(); } @@ -100,6 +113,7 @@ namespace libmpdataxx { this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_vctr(av, b, range_ijk[1], range_ijk[2]); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_vctr(av, b, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_vctr(av, b, range_ijk[0], range_ijk[1]); this->mem->barrier(); @@ -113,6 +127,7 @@ namespace libmpdataxx { this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[1], range_ijk[2], this->dijk[0]); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[2], range_ijk[0], this->dijk[1]); for (auto &bc : this->bcs[2]) bc->fill_halos_sgs_tnsr(av, w, vip_div, range_ijk[0], range_ijk[1], this->dijk[2]); this->mem->barrier(); @@ -131,6 +146,7 @@ namespace libmpdataxx bc->fill_halos_sgs_vctr(av, bv[0], range_ijkm[1], range_ijk[2]^1, 3); bc->fill_halos_sgs_vctr(av, bv[1], range_ijk[1]^1, range_ijkm[2], 4); } + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) { @@ -154,8 +170,8 @@ namespace libmpdataxx ) final { this->mem->barrier(); - const auto range_ijk_0__ext_h = this->extend_range(range_ijk[0], ext, h); - const auto range_ijk_0__ext_1 = this->extend_range(range_ijk[0], ext, 1); + const auto range_ijk_1__ext_h = this->extend_range(range_ijk[1], ext, h); + const auto range_ijk_1__ext_1 = this->extend_range(range_ijk[1], ext, 
1); if (!cyclic) { for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk[2]^ext^1, range_ijk[0]^ext^h); @@ -169,24 +185,36 @@ namespace libmpdataxx this->mem->barrier(); } - for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk_0__ext_h, range_ijk[1]^ext^1); + for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[0], range_ijk[0]^ext^h, range_ijk_1__ext_1); + + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[0]) bc->single_threaded ? bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1) : bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + barrier_if_single_threaded_bc0(); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); - for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk_0__ext_1, range_ijk[1]^ext^h); + for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); - for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk_0__ext_1); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[0]) bc->single_threaded ? bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h) : bc->fill_halos_vctr_nrml(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + barrier_if_single_threaded_bc0(); + + for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); } else { - for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[2]^ext^1, range_ijk_0__ext_h); - for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk_0__ext_h, range_ijk[1]^ext^1); + for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[2]^ext^1, range_ijk[0]^ext^h); + for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[0], range_ijk[0]^ext^h, range_ijk_1__ext_1); + + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[0]) bc->single_threaded ? bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1) : bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_1__ext_h, range_ijk[2]^ext^1); + barrier_if_single_threaded_bc0(); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[1]^ext^h, range_ijk[2]^ext^1); - for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk_0__ext_1, range_ijk[1]^ext^h); + for (auto &bc : this->bcs[2]) bc->fill_halos_vctr_nrml_cyclic(arrvec[1], range_ijk[0]^ext^1, range_ijk_1__ext_h); - for (auto &bc : this->bcs[0]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h); - for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[2]^ext^h, range_ijk_0__ext_1); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[0]) bc->single_threaded ? 
bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[1]^ext^1, range_ijk[2]^ext^h) : bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk_1__ext_1, range_ijk[2]^ext^h); + barrier_if_single_threaded_bc0(); + + for (auto &bc : this->bcs[1]) bc->fill_halos_vctr_nrml_cyclic(arrvec[2], range_ijk[2]^ext^h, range_ijk[0]^ext^1); } this->mem->barrier(); } @@ -197,11 +225,12 @@ namespace libmpdataxx const int ext = 0 ) final { - const auto range_ijk_0__ext = this->extend_range(range_ijk[0], ext); + const auto range_ijk_1__ext = this->extend_range(range_ijk[1], ext); this->mem->barrier(); - for (auto &bc : this->bcs[0]) bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext); - for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk_0__ext); - for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk_0__ext, range_ijk[1]^ext); + for (auto &bc : this->bcs[0]) bc->single_threaded ? bc->fill_halos_pres(arr, range_ijk[1]^ext, range_ijk[2]^ext) : bc->fill_halos_pres(arr, range_ijk_1__ext, range_ijk[2]^ext); + barrier_if_single_threaded_bc0(); + for (auto &bc : this->bcs[1]) bc->fill_halos_pres(arr, range_ijk[2]^ext, range_ijk[0]^ext); + for (auto &bc : this->bcs[2]) bc->fill_halos_pres(arr, range_ijk[0]^ext, range_ijk_1__ext); this->mem->barrier(); } @@ -211,7 +240,9 @@ namespace libmpdataxx const int &sign ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->set_edge_pres(av[0], range_ijk[1], range_ijk[2], sign); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->set_edge_pres(av[1], range_ijk[2], range_ijk[0], sign); for (auto &bc : this->bcs[2]) bc->set_edge_pres(av[2], range_ijk[0], range_ijk[1], sign); this->mem->barrier(); @@ -222,7 +253,9 @@ namespace libmpdataxx const idx_t<3> &range_ijk ) final { + this->mem->barrier(); for (auto &bc : this->bcs[0]) bc->save_edge_vel(av[0], range_ijk[1], range_ijk[2]); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->save_edge_vel(av[1], range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->save_edge_vel(av[2], range_ijk[0], range_ijk[1]); this->mem->barrier(); @@ -233,14 +266,18 @@ namespace libmpdataxx ) final { this->mem->barrier(); + for (auto &bc : this->bcs[0]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[1], range_ijk[2]); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[0]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[1], range_ijk[2]); + barrier_if_single_threaded_bc0(); for (auto &bc : this->bcs[1]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[1]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[2], range_ijk[0]); for (auto &bc : this->bcs[2]) bc->copy_edge_sclr_to_halo1_cyclic(arr, range_ijk[0], range_ijk[1]); for (auto &bc : this->bcs[2]) bc->avg_edge_and_halo1_sclr_cyclic(arr, range_ijk[0], range_ijk[1]); + this->mem->barrier(); } @@ -339,6 +376,7 @@ namespace libmpdataxx this->set_bcs(2, args.bczl, args.bczr); } + public: static void alloc( @@ -353,24 +391,28 @@ namespace libmpdataxx mem->psi[e].push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); // Courant field components (Arakawa-C grid) mem->GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_vctr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), 
+ arr3D_storage ))); mem->GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_vctr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); mem->GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_vctr(mem->grid_size[2]) + parent_t::rng_vctr(mem->grid_size[2]), + arr3D_storage ))); // fully third-order accurate mpdata needs also time derivatives of @@ -382,33 +424,39 @@ namespace libmpdataxx mem->ndt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_vctr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); mem->ndt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_vctr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); mem->ndt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_vctr(mem->grid_size[2]) + parent_t::rng_vctr(mem->grid_size[2]), + arr3D_storage ))); mem->ndtt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_vctr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); mem->ndtt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_vctr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); mem->ndtt_GC.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_vctr(mem->grid_size[2]) + parent_t::rng_vctr(mem->grid_size[2]), + arr3D_storage ))); } @@ -417,7 +465,8 @@ namespace libmpdataxx mem->G.reset(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); // allocate Kahan summation temporary vars @@ -426,7 +475,8 @@ namespace libmpdataxx mem->khn_tmp.push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); // courant field alloc_tmp_sclr(mem, __FILE__, 1); @@ -449,7 +499,8 @@ namespace libmpdataxx stgr[n][1] ? parent_t::rng_vctr(mem->grid_size[1]) : parent_t::rng_sclr(mem->grid_size[1]), srfc ? rng_t(0, 0) : stgr[n][2] ? parent_t::rng_vctr(mem->grid_size[2]) : - parent_t::rng_sclr(mem->grid_size[2]) + parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); } } @@ -479,7 +530,8 @@ namespace libmpdataxx mem->tmp[__file__].back().push_back(mem->old(new typename parent_t::arr_t( parent_t::rng_sclr(mem->grid_size[0]), parent_t::rng_sclr(mem->grid_size[1]), - srfc ? rng_t(0, 0) : parent_t::rng_sclr(mem->grid_size[2]) + srfc ? 
rng_t(0, 0) : parent_t::rng_sclr(mem->grid_size[2]), + arr3D_storage ))); } }; diff --git a/libmpdata++/solvers/detail/solver_common.hpp b/libmpdata++/solvers/detail/solver_common.hpp index c70cb54e..5aab5f07 100644 --- a/libmpdata++/solvers/detail/solver_common.hpp +++ b/libmpdata++/solvers/detail/solver_common.hpp @@ -133,7 +133,7 @@ namespace libmpdataxx scale(e, -ct_params_t::hint_scale(e)); } - // thread-aware range extension + // thread-aware range extension; note: it interferes with the range guessing done by the remote_3d bcond template <class n_t> rng_t extend_range(const rng_t &r, const n_t n) const { @@ -255,7 +255,7 @@ namespace libmpdataxx // TODO: MPI standard requires that the same thread that called mpi_init // calls mpi_finalize, we don't ensure it if(!libmpdataxx::concurr::detail::mpi_initialized_before && rank==0) - MPI::Finalize(); + MPI_Finalize(); #endif #if !defined(NDEBUG) assert(hook_ante_step_called && "any overriding hook_ante_step() must call parent_t::hook_ante_step()");
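An aside, not part of the patch: the switch from MPI::Init_thread/MPI::Finalize to the C API reflects the removal of the MPI C++ bindings in MPI-3. A minimal, self-contained sketch of the initialization pattern the patch adopts (guarding against double initialization and requesting MPI_THREAD_MULTIPLE); compile with an MPI wrapper such as mpicxx:

#include <mpi.h>
#include <cstdio>

int main(int argc, char **argv)
{
  int initialized = 0;
  MPI_Initialized(&initialized);   // avoid double initialization, as in distmem.hpp
  if (!initialized)
  {
    int provided = 0;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &provided);
    if (provided < MPI_THREAD_MULTIPLE)
      std::fprintf(stderr, "warning: MPI_THREAD_MULTIPLE not provided\n");
  }

  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  std::printf("rank %d running with thread support\n", rank);

  MPI_Finalize();
  return 0;
}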