Set default allocator id to device instead of unified for CUDA and HIP execution policies #1316

Merged May 22, 2024 · 35 commits

Commits
c29ee0c
Set default allocator to Device instead of Unified for CUDA and HIP
bmhan12 Jan 9, 2024
2df69e7
Fix core_serial_test
bmhan12 Jan 10, 2024
689ac01
Fix core acceleration example
bmhan12 Jan 10, 2024
4a74c4f
Fix primal_boundingbox_test
bmhan12 Jan 10, 2024
3cd3b95
Fix primal_clip_test
bmhan12 Jan 10, 2024
c83f46f
Simplify primal_boundingbox_test change
bmhan12 Jan 10, 2024
ed6862a
Fix primal_numeric_array, primal_point, primal_polyhedron tests
bmhan12 Jan 10, 2024
319ef1f
Fix primal_vector and primal_zip tests
bmhan12 Jan 11, 2024
644733f
Fix mint_execution_XXX_traversal tests (use Uniform policy)
bmhan12 Jan 17, 2024
e1619d6
Use unified memory for device insert core array for all example
bmhan12 Mar 12, 2024
2aa5f8a
Fix implicit grid tests - unified memory needed for getCandidates() call
bmhan12 Mar 12, 2024
a909870
Tentatively get spin bvh tests working
bmhan12 Mar 12, 2024
678c84e
Use unified memory for distributed closest point
bmhan12 Mar 13, 2024
1d8b24b
Fix signed distance tests
bmhan12 Mar 13, 2024
64a7c4f
Fix quest discretize tests
bmhan12 Mar 13, 2024
084fab5
Guard unified memory init in core array for all
bmhan12 Mar 15, 2024
f9606d4
Cleanup spin_bvh warnings
bmhan12 Mar 20, 2024
870e0b9
spin implicit grid tests - generalize allocator ids on host
bmhan12 Mar 20, 2024
2981e1c
quest discretize tests - generalize allocator ids on host
bmhan12 Mar 20, 2024
db443af
Use Unified Memory for IntersectionShaper
bmhan12 Mar 26, 2024
25ddc10
Use unified memory for mesh_tester executable
bmhan12 Mar 27, 2024
cb22df3
spin_bvh_test - loop should run on host, some cleanup
bmhan12 Mar 27, 2024
395dcc1
WIP - fixing signed distance on HIP
bmhan12 Mar 29, 2024
4621eb6
Use unified for signed distance tests
bmhan12 Apr 1, 2024
d05d425
Generalize allocator ids for non-raja, non-umpire build
bmhan12 Apr 2, 2024
2650824
Handle +raja~umpire failure case
bmhan12 Apr 2, 2024
721865a
One more missing fix
bmhan12 Apr 3, 2024
3cd84a9
Add comment on unified id usage where preferred instead of mandatory
bmhan12 Apr 10, 2024
f3b82c2
Use explicit allocators for clarity
bmhan12 Apr 15, 2024
c356f24
Helper method for unified mint tests
bmhan12 Apr 15, 2024
db350d1
Clarify variable name change
bmhan12 Apr 19, 2024
ab4292d
Use helper function for IntersectionShaper
bmhan12 Apr 23, 2024
011dbad
Remove if-block
bmhan12 May 21, 2024
7faa09d
Update notes with change
bmhan12 May 21, 2024
b2e8bd8
Merge branch 'develop' into feature/han12/device_default
bmhan12 May 22, 2024
1 change: 1 addition & 0 deletions RELEASE-NOTES.md
@@ -32,6 +32,7 @@ The Axom project release numbers follow [Semantic Versioning](http://semver.org/)
 - Primal: Makes several primitive methods available in device code

 ### Changed
+- Set default Umpire allocator id to device instead of unified for CUDA and HIP execution policies.
 - Upgrades `vcpkg` usage for axom's automated Windows builds to its
   [2024.03.19 release](https://github.com/microsoft/vcpkg/releases/tag/2024.03.19).
   Also updates vcpkg port versions for axom dependencies. Temporarily removes `umpire`
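In practice, this changes what `axom::execution_space<...>::allocatorID()` returns on GPU builds. Below is a minimal sketch of the new behavior at a call site, assuming an Umpire-enabled CUDA build (the HIP policies behave identically); `allocator_demo` and the block size are illustrative, not part of this PR:

  #include "axom/core.hpp"

  void allocator_demo()
  {
    using exec = axom::CUDA_EXEC<256>;

    // With this PR, the execution space's allocator maps to Umpire's
    // Device resource (previously Unified):
    const int devID = axom::execution_space<exec>::allocatorID();

    // Device memory is not host-dereferenceable, so code that reads or
    // writes the buffer on the host must request unified memory explicitly:
    const int umID = axom::getUmpireResourceAllocatorID(umpire::resource::Unified);

    constexpr int N = 16;
    int* device_buf = axom::allocate<int>(N, devID);  // device-only access
    int* unified_buf = axom::allocate<int>(N, umID);  // host + device access

    axom::deallocate(device_buf);
    axom::deallocate(unified_buf);
  }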
5 changes: 3 additions & 2 deletions src/axom/core/examples/core_acceleration.cpp
@@ -182,8 +182,7 @@ void demoAxomExecution()
   //_gpu_atomic_start
   using atomic_pol = typename axom::execution_space<ExecSpace>::atomic_policy;

-  int *sum =
-    axom::allocate<int>(1, axom::execution_space<ExecSpace>::allocatorID());
+  int *sum = axom::allocate<int>(1, allocator_id);
   *sum = 0;

   // Increment sum 100 times
@@ -193,6 +192,8 @@

   std::cout << "\nTotal Atomic Sum (" << axom::execution_space<ExecSpace>::name()
             << ") :" << sum[0] << std::endl;
+
+  axom::deallocate(sum);
   //_gpu_atomic_end

 #endif
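Worth noting: the example writes `*sum` on the host and increments it in a GPU kernel, so `allocator_id` must refer to host-accessible memory here. A condensed sketch of the full pattern, assuming an Umpire-enabled CUDA build and unified memory (the policy and iteration count are illustrative):

  using exec = axom::CUDA_EXEC<256>;
  using atomic_pol = typename axom::execution_space<exec>::atomic_policy;

  const int umID = axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
  int *sum = axom::allocate<int>(1, umID);
  *sum = 0;  // host write; valid only because the memory is unified

  // Increment sum 100 times from GPU threads
  axom::for_all<exec>(100, AXOM_LAMBDA(axom::IndexType) {
    RAJA::atomicAdd<atomic_pol>(sum, 1);
  });

  // The synchronous policy guarantees the kernel finished before this read
  std::cout << "sum = " << sum[0] << std::endl;  // prints 100

  axom::deallocate(sum);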
4 changes: 2 additions & 2 deletions src/axom/core/execution/internal/cuda_exec.hpp
@@ -63,7 +63,7 @@ struct execution_space<CUDA_EXEC<BLOCK_SIZE, SYNCHRONOUS>>
   static constexpr char* name() noexcept { return (char*)"[CUDA_EXEC]"; }
   static int allocatorID() noexcept
   {
-    return axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
+    return axom::getUmpireResourceAllocatorID(umpire::resource::Device);
   }
 };

@@ -93,7 +93,7 @@ struct execution_space<CUDA_EXEC<BLOCK_SIZE, ASYNC>>
   }
   static int allocatorID() noexcept
   {
-    return axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
+    return axom::getUmpireResourceAllocatorID(umpire::resource::Device);
   }
 };
 } // namespace axom
4 changes: 2 additions & 2 deletions src/axom/core/execution/internal/hip_exec.hpp
@@ -61,7 +61,7 @@ struct execution_space<HIP_EXEC<BLOCK_SIZE, SYNCHRONOUS>>
   static constexpr char* name() noexcept { return (char*)"[HIP_EXEC]"; }
   static int allocatorID() noexcept
   {
-    return axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
+    return axom::getUmpireResourceAllocatorID(umpire::resource::Device);
   }
 };

@@ -88,7 +88,7 @@ struct execution_space<HIP_EXEC<BLOCK_SIZE, ASYNC>>
   static constexpr char* name() noexcept { return (char*)"[HIP_EXEC] (async)"; }
   static int allocatorID() noexcept
   {
-    return axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
+    return axom::getUmpireResourceAllocatorID(umpire::resource::Device);
   }
 };
 } // namespace axom
6 changes: 6 additions & 0 deletions src/axom/core/tests/core_array_for_all.hpp
@@ -1204,6 +1204,12 @@ AXOM_TYPED_TEST(core_array_for_all, device_insert)
     typename TestFixture::template DynamicTArray<DynamicArray>;

   int kernelAllocID = axom::execution_space<ExecSpace>::allocatorID();
+#if defined(AXOM_USE_GPU) && defined(AXOM_USE_UMPIRE)
+  // Use unified memory for frequent movement between device operations
+  // and value checking on host
+  kernelAllocID = axom::getUmpireResourceAllocatorID(
+    umpire::resource::MemoryResourceType::Unified);
+#endif

   constexpr axom::IndexType N = 374;
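The guarded override exists because this test dereferences array elements on the host after device kernels run. A rough sketch of the failure mode it avoids, using a hypothetical buffer `p` and an illustrative execution policy:

  // Host dereference of device memory is invalid under the new default:
  //   int* bad = axom::allocate<int>(4, axom::execution_space<axom::CUDA_EXEC<256>>::allocatorID());
  //   EXPECT_EQ(bad[0], 42);  // crash or garbage: bad points to device memory

  // Unified memory migrates pages between host and device on demand,
  // so the same host-side check is valid:
  const int umID = axom::getUmpireResourceAllocatorID(
    umpire::resource::MemoryResourceType::Unified);
  int* p = axom::allocate<int>(4, umID);
  axom::for_all<axom::CUDA_EXEC<256>>(4, AXOM_LAMBDA(axom::IndexType i) { p[i] = 42; });
  EXPECT_EQ(p[0], 42);  // OK: unified memory is host-accessible
  axom::deallocate(p);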
20 changes: 12 additions & 8 deletions src/axom/core/tests/core_execution_for_all.hpp
@@ -32,12 +32,12 @@ void check_for_all()
   constexpr int VALUE_2 = 42;
   constexpr int N = 256;

-  // STEP 1: set default allocator for the execution space
-  const int currentAllocatorID = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(axom::execution_space<ExecSpace>::allocatorID());
+  // STEP 1: set allocators for the execution spaces
+  const int hostID = axom::execution_space<axom::SEQ_EXEC>::allocatorID();
+  const int allocID = axom::execution_space<ExecSpace>::allocatorID();

   // STEP 0: allocate buffer
-  int* a = axom::allocate<int>(N);
+  int* a = axom::allocate<int>(N, allocID);

   // STEP 1: initialize to VALUE_1
   axom::for_all<ExecSpace>(
@@ -50,9 +50,12 @@
   }

   // STEP 2: check array
+  int* a_host = axom::allocate<int>(N, hostID);
+  axom::copy(a_host, a, N * sizeof(int));
+
   for(int i = 0; i < N; ++i)
   {
-    EXPECT_EQ(a[i], VALUE_1);
+    EXPECT_EQ(a_host[i], VALUE_1);
   }

   // STEP 3: add VALUE_2 to all entries resulting to zero
@@ -67,15 +70,16 @@
   }

   // STEP 4: check result
+  axom::copy(a_host, a, N * sizeof(int));
+
   for(int i = 0; i < N; ++i)
   {
-    EXPECT_EQ(a[i], 0);
+    EXPECT_EQ(a_host[i], 0);
   }

   // STEP 5: cleanup
   axom::deallocate(a);
-
-  axom::setDefaultAllocator(currentAllocatorID);
+  axom::deallocate(a_host);
 }

 } /* end anonymous namespace */
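The rewritten test follows the pattern this PR applies throughout: allocate in the execution space's memory, compute there, and copy into host memory before asserting. A condensed sketch, assuming a GPU `ExecSpace` and an `N`-element buffer:

  const int hostID = axom::execution_space<axom::SEQ_EXEC>::allocatorID();
  const int allocID = axom::execution_space<ExecSpace>::allocatorID();

  int* a = axom::allocate<int>(N, allocID);  // device memory under the new default
  int* a_host = axom::allocate<int>(N, hostID);

  axom::for_all<ExecSpace>(N, AXOM_LAMBDA(axom::IndexType i) { a[i] = 42; });
  axom::copy(a_host, a, N * sizeof(int));  // size is in bytes, not elements

  for(int i = 0; i < N; ++i)
  {
    EXPECT_EQ(a_host[i], 42);
  }

  axom::deallocate(a);
  axom::deallocate(a_host);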
12 changes: 4 additions & 8 deletions src/axom/core/tests/core_execution_space.hpp
@@ -183,8 +183,7 @@ TEST(core_execution_space, check_cuda_exec)
   constexpr bool IS_ASYNC = false;
   constexpr bool ON_DEVICE = true;

-  int allocator_id =
-    axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
+  int allocator_id = axom::getUmpireResourceAllocatorID(umpire::resource::Device);
   check_execution_mappings<axom::CUDA_EXEC<BLOCK_SIZE>,
                            RAJA::cuda_exec<BLOCK_SIZE>,
                            RAJA::cuda_reduce,
@@ -204,8 +203,7 @@ TEST(core_execution_space, check_cuda_exec_async)
   constexpr bool IS_ASYNC = true;
   constexpr bool ON_DEVICE = true;

-  int allocator_id =
-    axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
+  int allocator_id = axom::getUmpireResourceAllocatorID(umpire::resource::Device);
   check_execution_mappings<axom::CUDA_EXEC<BLOCK_SIZE, axom::ASYNC>,
                            RAJA::cuda_exec_async<BLOCK_SIZE>,
                            RAJA::cuda_reduce,
@@ -228,8 +226,7 @@ TEST(core_execution_space, check_hip_exec)
   constexpr bool IS_ASYNC = false;
   constexpr bool ON_DEVICE = true;

-  int allocator_id =
-    axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
+  int allocator_id = axom::getUmpireResourceAllocatorID(umpire::resource::Device);
   check_execution_mappings<axom::HIP_EXEC<BLOCK_SIZE>,
                            RAJA::hip_exec<BLOCK_SIZE>,
                            RAJA::hip_reduce,
@@ -249,8 +246,7 @@ TEST(core_execution_space, check_hip_exec_async)
   constexpr bool IS_ASYNC = true;
   constexpr bool ON_DEVICE = true;

-  int allocator_id =
-    axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
+  int allocator_id = axom::getUmpireResourceAllocatorID(umpire::resource::Device);
   check_execution_mappings<axom::HIP_EXEC<BLOCK_SIZE, axom::ASYNC>,
                            RAJA::hip_exec_async<BLOCK_SIZE>,
                            RAJA::hip_reduce,
64 changes: 28 additions & 36 deletions src/axom/mint/tests/mint_execution_cell_traversals.cpp
@@ -30,6 +30,22 @@ namespace mint
 //------------------------------------------------------------------------------
 namespace
 {
+#if defined(AXOM_USE_RAJA) && defined(AXOM_USE_UMPIRE) &&   \
+  ((defined(AXOM_USE_CUDA) && defined(RAJA_ENABLE_CUDA)) || \
+   (defined(AXOM_USE_HIP) && defined(RAJA_ENABLE_HIP)))
+
+int set_um_memory_return_previous_allocator()
+{
+  // Use unified memory
+  const int exec_space_id = axom::getUmpireResourceAllocatorID(
+    umpire::resource::MemoryResourceType::Unified);
+  const int prev_allocator = axom::getDefaultAllocatorID();
+  axom::setDefaultAllocator(exec_space_id);
+  return prev_allocator;
+}
+
+#endif
+
 template <typename ExecPolicy, int MeshType, int Topology = SINGLE_SHAPE>
 void check_for_all_cells_idx(int dimension)
 {

Review thread on the block above (resolved):

Member: It looks like all this code is the same as the code in the files for mint tests of face traversals and node traversals. It seems like it would save a lot of maintenance if it were centralized in one place and reused. @bmhan12 if you agree, this could be done in a subsequent PR.

bmhan12 (Contributor, Author): Added this as a task as part of #1339.
@@ -378,9 +394,7 @@ AXOM_CUDA_TEST(mint_execution_cell_traversals, for_all_cells_nodeids)

   using cuda_exec = axom::CUDA_EXEC<512>;

-  const int exec_space_id = axom::execution_space<cuda_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cell_nodes<cuda_exec, STRUCTURED_UNIFORM_MESH>(i);
   check_for_all_cell_nodes<cuda_exec, STRUCTURED_CURVILINEAR_MESH>(i);
@@ -396,9 +410,7 @@

   using hip_exec = axom::HIP_EXEC<512>;

-  const int exec_space_id = axom::execution_space<hip_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cell_nodes<hip_exec, STRUCTURED_UNIFORM_MESH>(i);
   check_for_all_cell_nodes<hip_exec, STRUCTURED_CURVILINEAR_MESH>(i);
@@ -441,9 +453,7 @@ AXOM_CUDA_TEST(mint_execution_cell_traversals, for_all_cells_coords)

   using cuda_exec = axom::CUDA_EXEC<512>;

-  const int exec_space_id = axom::execution_space<cuda_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cell_coords<cuda_exec, STRUCTURED_UNIFORM_MESH>(i);
   check_for_all_cell_coords<cuda_exec, STRUCTURED_CURVILINEAR_MESH>(i);
@@ -459,9 +469,7 @@

   using hip_exec = axom::HIP_EXEC<512>;

-  const int exec_space_id = axom::execution_space<hip_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cell_coords<hip_exec, STRUCTURED_UNIFORM_MESH>(i);
   check_for_all_cell_coords<hip_exec, STRUCTURED_CURVILINEAR_MESH>(i);
@@ -504,9 +512,7 @@ AXOM_CUDA_TEST(mint_execution_cell_traversals, for_all_cells_faceids)

   using cuda_exec = axom::CUDA_EXEC<512>;

-  const int exec_space_id = axom::execution_space<cuda_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cell_faces<cuda_exec, STRUCTURED_UNIFORM_MESH>(i);
   check_for_all_cell_faces<cuda_exec, STRUCTURED_CURVILINEAR_MESH>(i);
@@ -522,9 +528,7 @@

   using hip_exec = axom::HIP_EXEC<512>;

-  const int exec_space_id = axom::execution_space<hip_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cell_faces<hip_exec, STRUCTURED_UNIFORM_MESH>(i);
   check_for_all_cell_faces<hip_exec, STRUCTURED_CURVILINEAR_MESH>(i);
@@ -561,9 +565,7 @@ AXOM_CUDA_TEST(mint_execution_cell_traversals, for_all_cells_ij)

   using cuda_exec = axom::CUDA_EXEC<512>;

-  const int exec_space_id = axom::execution_space<cuda_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cells_ij<cuda_exec, STRUCTURED_UNIFORM_MESH>();
   check_for_all_cells_ij<cuda_exec, STRUCTURED_CURVILINEAR_MESH>();
@@ -577,9 +579,7 @@

   using hip_exec = axom::HIP_EXEC<512>;

-  const int exec_space_id = axom::execution_space<hip_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cells_ij<hip_exec, STRUCTURED_UNIFORM_MESH>();
   check_for_all_cells_ij<hip_exec, STRUCTURED_CURVILINEAR_MESH>();
@@ -612,9 +612,7 @@ AXOM_CUDA_TEST(mint_execution_cell_traversals, for_all_cells_ijk)

   using cuda_exec = axom::CUDA_EXEC<512>;

-  const int exec_space_id = axom::execution_space<cuda_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cells_ijk<cuda_exec, STRUCTURED_UNIFORM_MESH>();
   check_for_all_cells_ijk<cuda_exec, STRUCTURED_CURVILINEAR_MESH>();
@@ -628,9 +626,7 @@

   using hip_exec = axom::HIP_EXEC<512>;

-  const int exec_space_id = axom::execution_space<hip_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cells_ijk<hip_exec, STRUCTURED_UNIFORM_MESH>();
   check_for_all_cells_ijk<hip_exec, STRUCTURED_CURVILINEAR_MESH>();
@@ -670,9 +666,7 @@ AXOM_CUDA_TEST(mint_execution_cell_traversals, for_all_cells_index)

   using cuda_exec = axom::CUDA_EXEC<512>;

-  const int exec_space_id = axom::execution_space<cuda_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cells_idx<cuda_exec, STRUCTURED_UNIFORM_MESH>(i);
   check_for_all_cells_idx<cuda_exec, STRUCTURED_CURVILINEAR_MESH>(i);
@@ -688,9 +682,7 @@

   using hip_exec = axom::HIP_EXEC<512>;

-  const int exec_space_id = axom::execution_space<hip_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cells_idx<hip_exec, STRUCTURED_UNIFORM_MESH>(i);
   check_for_all_cells_idx<hip_exec, STRUCTURED_CURVILINEAR_MESH>(i);
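For reference, each GPU test body now uses the helper roughly as follows; the restore step is inferred from the captured return value rather than shown in the hunks above (a sketch, not the verbatim test code):

  const int prev_allocator = set_um_memory_return_previous_allocator();

  // ... traversal checks that mix device kernels with host-side
  //     verification, relying on the unified default allocator ...

  axom::setDefaultAllocator(prev_allocator);  // restore the previous default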