Set default allocator id to device instead of unified for CUDA and HIP execution policies #1316

Merged May 22, 2024 · 35 commits

Commits
c29ee0c
Set default allocator to Device instead of Unified for CUDA and HIP
bmhan12 Jan 9, 2024
2df69e7
Fix core_serial_test
bmhan12 Jan 10, 2024
689ac01
Fix core acceleration example
bmhan12 Jan 10, 2024
4a74c4f
Fix primal_boundingbox_test
bmhan12 Jan 10, 2024
3cd3b95
Fix primal_clip_test
bmhan12 Jan 10, 2024
c83f46f
Simplify primal_boundingbox_test change
bmhan12 Jan 10, 2024
ed6862a
Fix primal_numeric_array, primal_point, primal_polyhedron tests
bmhan12 Jan 10, 2024
319ef1f
Fix primal_vector and primal_zip tests
bmhan12 Jan 11, 2024
644733f
Fix mint_execution_XXX_traversal tests (use Uniform policy)
bmhan12 Jan 17, 2024
e1619d6
Use unified memory for device insert core array for all example
bmhan12 Mar 12, 2024
2aa5f8a
Fix implicit grid tests - unified memory needed for getCandidates() call
bmhan12 Mar 12, 2024
a909870
Tentatively get spin bvh tests working
bmhan12 Mar 12, 2024
678c84e
Use unified memory for distributed closest point
bmhan12 Mar 13, 2024
1d8b24b
Fix signed distance tests
bmhan12 Mar 13, 2024
64a7c4f
Fix quest discretize tests
bmhan12 Mar 13, 2024
084fab5
Guard unified memory init in core array for all
bmhan12 Mar 15, 2024
f9606d4
Cleanup spin_bvh warnings
bmhan12 Mar 20, 2024
870e0b9
spin implicit grid tests - generalize allocator ids on host
bmhan12 Mar 20, 2024
2981e1c
quest discretize tests - generalize allocator ids on host
bmhan12 Mar 20, 2024
db443af
Use Unified Memory for IntersectionShaper
bmhan12 Mar 26, 2024
25ddc10
Use unified memory for mesh_tester executable
bmhan12 Mar 27, 2024
cb22df3
spin_bvh_test - loop should run on host, some cleanup
bmhan12 Mar 27, 2024
395dcc1
WIP - fixing signed distance on HIP
bmhan12 Mar 29, 2024
4621eb6
Use unified for signed distance tests
bmhan12 Apr 1, 2024
d05d425
Generalize allocator ids for non-raja, non-umpire build
bmhan12 Apr 2, 2024
2650824
Handle +raja~umpire failure case
bmhan12 Apr 2, 2024
721865a
One more missing fix
bmhan12 Apr 3, 2024
3cd84a9
Add comment on unified id usage where preferred instead of mandatory
bmhan12 Apr 10, 2024
f3b82c2
Use explicit allocators for clarity
bmhan12 Apr 15, 2024
c356f24
Helper method for unified mint tests
bmhan12 Apr 15, 2024
db350d1
Clarify variable name change
bmhan12 Apr 19, 2024
ab4292d
Use helper function for IntersectionShaper
bmhan12 Apr 23, 2024
011dbad
Remove if-block
bmhan12 May 21, 2024
7faa09d
Update notes with change
bmhan12 May 21, 2024
b2e8bd8
Merge branch 'develop' into feature/han12/device_default
bmhan12 May 22, 2024
1 change: 1 addition & 0 deletions RELEASE-NOTES.md
@@ -32,6 +32,7 @@ The Axom project release numbers follow [Semantic Versioning](http://semver.org/)
 - Primal: Makes several primitive methods available in device code

 ### Changed
+- Set default Umpire allocator id to device instead of unified for CUDA and HIP execution policies.
 - Upgrades `vcpkg` usage for axom's automated Windows builds to its
   [2024.03.19 release](https://github.com/microsoft/vcpkg/releases/tag/2024.03.19).
   Also updates vcpkg port versions for axom dependencies. Temporarily removes `umpire`
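In practice, this changes what `axom::execution_space<...>::allocatorID()` returns on GPU builds. Below is a minimal sketch of the new behavior at a call site, assuming an Umpire-enabled CUDA build (the HIP policies behave identically); `allocator_demo` and the block size are illustrative, not part of this PR:

  #include "axom/core.hpp"

  void allocator_demo()
  {
    using exec = axom::CUDA_EXEC<256>;

    // With this PR, the execution space's allocator maps to Umpire's
    // Device resource (previously Unified):
    const int devID = axom::execution_space<exec>::allocatorID();

    // Device memory is not host-dereferenceable, so code that reads or
    // writes the buffer on the host must request unified memory explicitly:
    const int umID = axom::getUmpireResourceAllocatorID(umpire::resource::Unified);

    constexpr int N = 16;
    int* device_buf = axom::allocate<int>(N, devID);  // device-only access
    int* unified_buf = axom::allocate<int>(N, umID);  // host + device access

    axom::deallocate(device_buf);
    axom::deallocate(unified_buf);
  }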
5 changes: 3 additions & 2 deletions src/axom/core/examples/core_acceleration.cpp
@@ -182,8 +182,7 @@ void demoAxomExecution()
   //_gpu_atomic_start
   using atomic_pol = typename axom::execution_space<ExecSpace>::atomic_policy;

-  int *sum =
-    axom::allocate<int>(1, axom::execution_space<ExecSpace>::allocatorID());
+  int *sum = axom::allocate<int>(1, allocator_id);
   *sum = 0;

   // Increment sum 100 times
@@ -193,6 +192,8 @@

   std::cout << "\nTotal Atomic Sum (" << axom::execution_space<ExecSpace>::name()
             << ") :" << sum[0] << std::endl;
+
+  axom::deallocate(sum);
   //_gpu_atomic_end

 #endif
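Worth noting: the example writes `*sum` on the host and increments it in a GPU kernel, so `allocator_id` must refer to host-accessible memory here. A condensed sketch of the full pattern, assuming an Umpire-enabled CUDA build and unified memory (the policy and iteration count are illustrative):

  using exec = axom::CUDA_EXEC<256>;
  using atomic_pol = typename axom::execution_space<exec>::atomic_policy;

  const int umID = axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
  int *sum = axom::allocate<int>(1, umID);
  *sum = 0;  // host write; valid only because the memory is unified

  // Increment sum 100 times from GPU threads
  axom::for_all<exec>(100, AXOM_LAMBDA(axom::IndexType) {
    RAJA::atomicAdd<atomic_pol>(sum, 1);
  });

  // The synchronous policy guarantees the kernel finished before this read
  std::cout << "sum = " << sum[0] << std::endl;  // prints 100

  axom::deallocate(sum);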
4 changes: 2 additions & 2 deletions src/axom/core/execution/internal/cuda_exec.hpp
@@ -63,7 +63,7 @@ struct execution_space<CUDA_EXEC<BLOCK_SIZE, SYNCHRONOUS>>
   static constexpr char* name() noexcept { return (char*)"[CUDA_EXEC]"; }
   static int allocatorID() noexcept
   {
-    return axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
+    return axom::getUmpireResourceAllocatorID(umpire::resource::Device);
   }
 };

@@ -93,7 +93,7 @@ struct execution_space<CUDA_EXEC<BLOCK_SIZE, ASYNC>>
   }
   static int allocatorID() noexcept
   {
-    return axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
+    return axom::getUmpireResourceAllocatorID(umpire::resource::Device);
   }
 };
 } // namespace axom
4 changes: 2 additions & 2 deletions src/axom/core/execution/internal/hip_exec.hpp
@@ -61,7 +61,7 @@ struct execution_space<HIP_EXEC<BLOCK_SIZE, SYNCHRONOUS>>
   static constexpr char* name() noexcept { return (char*)"[HIP_EXEC]"; }
   static int allocatorID() noexcept
   {
-    return axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
+    return axom::getUmpireResourceAllocatorID(umpire::resource::Device);
   }
 };

@@ -88,7 +88,7 @@ struct execution_space<HIP_EXEC<BLOCK_SIZE, ASYNC>>
   static constexpr char* name() noexcept { return (char*)"[HIP_EXEC] (async)"; }
   static int allocatorID() noexcept
   {
-    return axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
+    return axom::getUmpireResourceAllocatorID(umpire::resource::Device);
   }
 };
 } // namespace axom
6 changes: 6 additions & 0 deletions src/axom/core/tests/core_array_for_all.hpp
@@ -1204,6 +1204,12 @@ AXOM_TYPED_TEST(core_array_for_all, device_insert)
     typename TestFixture::template DynamicTArray<DynamicArray>;

   int kernelAllocID = axom::execution_space<ExecSpace>::allocatorID();
+#if defined(AXOM_USE_GPU) && defined(AXOM_USE_UMPIRE)
+  // Use unified memory for frequent movement between device operations
+  // and value checking on host
+  kernelAllocID = axom::getUmpireResourceAllocatorID(
+    umpire::resource::MemoryResourceType::Unified);
+#endif

   constexpr axom::IndexType N = 374;
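The guarded override exists because this test dereferences array elements on the host after device kernels run. A rough sketch of the failure mode it avoids, using a hypothetical buffer `p` and an illustrative execution policy:

  // Host dereference of device memory is invalid under the new default:
  //   int* bad = axom::allocate<int>(4, axom::execution_space<axom::CUDA_EXEC<256>>::allocatorID());
  //   EXPECT_EQ(bad[0], 42);  // crash or garbage: bad points to device memory

  // Unified memory migrates pages between host and device on demand,
  // so the same host-side check is valid:
  const int umID = axom::getUmpireResourceAllocatorID(
    umpire::resource::MemoryResourceType::Unified);
  int* p = axom::allocate<int>(4, umID);
  axom::for_all<axom::CUDA_EXEC<256>>(4, AXOM_LAMBDA(axom::IndexType i) { p[i] = 42; });
  EXPECT_EQ(p[0], 42);  // OK: unified memory is host-accessible
  axom::deallocate(p);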
20 changes: 12 additions & 8 deletions src/axom/core/tests/core_execution_for_all.hpp
@@ -32,12 +32,12 @@ void check_for_all()
   constexpr int VALUE_2 = 42;
   constexpr int N = 256;

-  // STEP 1: set default allocator for the execution space
-  const int currentAllocatorID = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(axom::execution_space<ExecSpace>::allocatorID());
+  // STEP 1: set allocators for the execution spaces
+  const int hostID = axom::execution_space<axom::SEQ_EXEC>::allocatorID();
+  const int allocID = axom::execution_space<ExecSpace>::allocatorID();

   // STEP 0: allocate buffer
-  int* a = axom::allocate<int>(N);
+  int* a = axom::allocate<int>(N, allocID);

   // STEP 1: initialize to VALUE_1
   axom::for_all<ExecSpace>(
@@ -50,9 +50,12 @@
   }

   // STEP 2: check array
+  int* a_host = axom::allocate<int>(N, hostID);
+  axom::copy(a_host, a, N * sizeof(int));
+
   for(int i = 0; i < N; ++i)
   {
-    EXPECT_EQ(a[i], VALUE_1);
+    EXPECT_EQ(a_host[i], VALUE_1);
   }

   // STEP 3: add VALUE_2 to all entries resulting to zero
@@ -67,15 +70,16 @@
   }

   // STEP 4: check result
+  axom::copy(a_host, a, N * sizeof(int));
+
   for(int i = 0; i < N; ++i)
   {
-    EXPECT_EQ(a[i], 0);
+    EXPECT_EQ(a_host[i], 0);
   }

   // STEP 5: cleanup
   axom::deallocate(a);
-
-  axom::setDefaultAllocator(currentAllocatorID);
+  axom::deallocate(a_host);
 }

 } /* end anonymous namespace */
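The rewritten test follows the pattern this PR applies throughout: allocate in the execution space's memory, compute there, and copy into host memory before asserting. A condensed sketch, assuming a GPU `ExecSpace` and an `N`-element buffer:

  const int hostID = axom::execution_space<axom::SEQ_EXEC>::allocatorID();
  const int allocID = axom::execution_space<ExecSpace>::allocatorID();

  int* a = axom::allocate<int>(N, allocID);  // device memory under the new default
  int* a_host = axom::allocate<int>(N, hostID);

  axom::for_all<ExecSpace>(N, AXOM_LAMBDA(axom::IndexType i) { a[i] = 42; });
  axom::copy(a_host, a, N * sizeof(int));  // size is in bytes, not elements

  for(int i = 0; i < N; ++i)
  {
    EXPECT_EQ(a_host[i], 42);
  }

  axom::deallocate(a);
  axom::deallocate(a_host);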
12 changes: 4 additions & 8 deletions src/axom/core/tests/core_execution_space.hpp
@@ -183,8 +183,7 @@ TEST(core_execution_space, check_cuda_exec)
   constexpr bool IS_ASYNC = false;
   constexpr bool ON_DEVICE = true;

-  int allocator_id =
-    axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
+  int allocator_id = axom::getUmpireResourceAllocatorID(umpire::resource::Device);
   check_execution_mappings<axom::CUDA_EXEC<BLOCK_SIZE>,
                            RAJA::cuda_exec<BLOCK_SIZE>,
                            RAJA::cuda_reduce,
@@ -204,8 +203,7 @@ TEST(core_execution_space, check_cuda_exec_async)
   constexpr bool IS_ASYNC = true;
   constexpr bool ON_DEVICE = true;

-  int allocator_id =
-    axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
+  int allocator_id = axom::getUmpireResourceAllocatorID(umpire::resource::Device);
   check_execution_mappings<axom::CUDA_EXEC<BLOCK_SIZE, axom::ASYNC>,
                            RAJA::cuda_exec_async<BLOCK_SIZE>,
                            RAJA::cuda_reduce,
@@ -228,8 +226,7 @@ TEST(core_execution_space, check_hip_exec)
   constexpr bool IS_ASYNC = false;
   constexpr bool ON_DEVICE = true;

-  int allocator_id =
-    axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
+  int allocator_id = axom::getUmpireResourceAllocatorID(umpire::resource::Device);
   check_execution_mappings<axom::HIP_EXEC<BLOCK_SIZE>,
                            RAJA::hip_exec<BLOCK_SIZE>,
                            RAJA::hip_reduce,
@@ -249,8 +246,7 @@ TEST(core_execution_space, check_hip_exec_async)
   constexpr bool IS_ASYNC = true;
   constexpr bool ON_DEVICE = true;

-  int allocator_id =
-    axom::getUmpireResourceAllocatorID(umpire::resource::Unified);
+  int allocator_id = axom::getUmpireResourceAllocatorID(umpire::resource::Device);
   check_execution_mappings<axom::HIP_EXEC<BLOCK_SIZE, axom::ASYNC>,
                            RAJA::hip_exec_async<BLOCK_SIZE>,
                            RAJA::hip_reduce,
64 changes: 28 additions & 36 deletions src/axom/mint/tests/mint_execution_cell_traversals.cpp
@@ -30,6 +30,22 @@ namespace mint
 //------------------------------------------------------------------------------
 namespace
 {
+#if defined(AXOM_USE_RAJA) && defined(AXOM_USE_UMPIRE) &&   \
+  ((defined(AXOM_USE_CUDA) && defined(RAJA_ENABLE_CUDA)) || \
+   (defined(AXOM_USE_HIP) && defined(RAJA_ENABLE_HIP)))
+
+int set_um_memory_return_previous_allocator()
+{
+  // Use unified memory
+  const int exec_space_id = axom::getUmpireResourceAllocatorID(
+    umpire::resource::MemoryResourceType::Unified);
+  const int prev_allocator = axom::getDefaultAllocatorID();
+  axom::setDefaultAllocator(exec_space_id);
+  return prev_allocator;
+}
+
+#endif
+
 template <typename ExecPolicy, int MeshType, int Topology = SINGLE_SHAPE>
 void check_for_all_cells_idx(int dimension)
 {

Review thread on the block above (resolved):

Member: It looks like all this code is the same as the code in the files for mint tests of face traversals and node traversals. It seems like it would save a lot of maintenance if it were centralized in one place and reused. @bmhan12 if you agree, this could be done in a subsequent PR.

bmhan12 (Contributor, Author): Added this as a task as part of #1339.
@@ -378,9 +394,7 @@ AXOM_CUDA_TEST(mint_execution_cell_traversals, for_all_cells_nodeids)

   using cuda_exec = axom::CUDA_EXEC<512>;

-  const int exec_space_id = axom::execution_space<cuda_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cell_nodes<cuda_exec, STRUCTURED_UNIFORM_MESH>(i);
   check_for_all_cell_nodes<cuda_exec, STRUCTURED_CURVILINEAR_MESH>(i);
@@ -396,9 +410,7 @@

   using hip_exec = axom::HIP_EXEC<512>;

-  const int exec_space_id = axom::execution_space<hip_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cell_nodes<hip_exec, STRUCTURED_UNIFORM_MESH>(i);
   check_for_all_cell_nodes<hip_exec, STRUCTURED_CURVILINEAR_MESH>(i);
@@ -441,9 +453,7 @@ AXOM_CUDA_TEST(mint_execution_cell_traversals, for_all_cells_coords)

   using cuda_exec = axom::CUDA_EXEC<512>;

-  const int exec_space_id = axom::execution_space<cuda_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cell_coords<cuda_exec, STRUCTURED_UNIFORM_MESH>(i);
   check_for_all_cell_coords<cuda_exec, STRUCTURED_CURVILINEAR_MESH>(i);
@@ -459,9 +469,7 @@

   using hip_exec = axom::HIP_EXEC<512>;

-  const int exec_space_id = axom::execution_space<hip_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cell_coords<hip_exec, STRUCTURED_UNIFORM_MESH>(i);
   check_for_all_cell_coords<hip_exec, STRUCTURED_CURVILINEAR_MESH>(i);
@@ -504,9 +512,7 @@ AXOM_CUDA_TEST(mint_execution_cell_traversals, for_all_cells_faceids)

   using cuda_exec = axom::CUDA_EXEC<512>;

-  const int exec_space_id = axom::execution_space<cuda_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cell_faces<cuda_exec, STRUCTURED_UNIFORM_MESH>(i);
   check_for_all_cell_faces<cuda_exec, STRUCTURED_CURVILINEAR_MESH>(i);
@@ -522,9 +528,7 @@

   using hip_exec = axom::HIP_EXEC<512>;

-  const int exec_space_id = axom::execution_space<hip_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cell_faces<hip_exec, STRUCTURED_UNIFORM_MESH>(i);
   check_for_all_cell_faces<hip_exec, STRUCTURED_CURVILINEAR_MESH>(i);
@@ -561,9 +565,7 @@ AXOM_CUDA_TEST(mint_execution_cell_traversals, for_all_cells_ij)

   using cuda_exec = axom::CUDA_EXEC<512>;

-  const int exec_space_id = axom::execution_space<cuda_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cells_ij<cuda_exec, STRUCTURED_UNIFORM_MESH>();
   check_for_all_cells_ij<cuda_exec, STRUCTURED_CURVILINEAR_MESH>();
@@ -577,9 +579,7 @@

   using hip_exec = axom::HIP_EXEC<512>;

-  const int exec_space_id = axom::execution_space<hip_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cells_ij<hip_exec, STRUCTURED_UNIFORM_MESH>();
   check_for_all_cells_ij<hip_exec, STRUCTURED_CURVILINEAR_MESH>();
@@ -612,9 +612,7 @@ AXOM_CUDA_TEST(mint_execution_cell_traversals, for_all_cells_ijk)

   using cuda_exec = axom::CUDA_EXEC<512>;

-  const int exec_space_id = axom::execution_space<cuda_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cells_ijk<cuda_exec, STRUCTURED_UNIFORM_MESH>();
   check_for_all_cells_ijk<cuda_exec, STRUCTURED_CURVILINEAR_MESH>();
@@ -628,9 +626,7 @@

   using hip_exec = axom::HIP_EXEC<512>;

-  const int exec_space_id = axom::execution_space<hip_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cells_ijk<hip_exec, STRUCTURED_UNIFORM_MESH>();
   check_for_all_cells_ijk<hip_exec, STRUCTURED_CURVILINEAR_MESH>();
@@ -670,9 +666,7 @@ AXOM_CUDA_TEST(mint_execution_cell_traversals, for_all_cells_index)

   using cuda_exec = axom::CUDA_EXEC<512>;

-  const int exec_space_id = axom::execution_space<cuda_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cells_idx<cuda_exec, STRUCTURED_UNIFORM_MESH>(i);
   check_for_all_cells_idx<cuda_exec, STRUCTURED_CURVILINEAR_MESH>(i);
@@ -688,9 +682,7 @@

   using hip_exec = axom::HIP_EXEC<512>;

-  const int exec_space_id = axom::execution_space<hip_exec>::allocatorID();
-  const int prev_allocator = axom::getDefaultAllocatorID();
-  axom::setDefaultAllocator(exec_space_id);
+  const int prev_allocator = set_um_memory_return_previous_allocator();

   check_for_all_cells_idx<hip_exec, STRUCTURED_UNIFORM_MESH>(i);
   check_for_all_cells_idx<hip_exec, STRUCTURED_CURVILINEAR_MESH>(i);
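For reference, each GPU test body now uses the helper roughly as follows; the restore step is inferred from the captured return value rather than shown in the hunks above (a sketch, not the verbatim test code):

  const int prev_allocator = set_um_memory_return_previous_allocator();

  // ... traversal checks that mix device kernels with host-side
  //     verification, relying on the unified default allocator ...

  axom::setDefaultAllocator(prev_allocator);  // restore the previous default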