Skip to content

Commit

Permalink
Merge pull request #6885 from raffenet/4.2.x-cuda-no-device
Browse files Browse the repository at this point in the history
[4.2.x] Fix potential crash with no GPU devices
  • Loading branch information
raffenet committed Jan 29, 2024
2 parents dae07fc + a408512 commit e693b16
Show file tree
Hide file tree
Showing 8 changed files with 116 additions and 47 deletions.
3 changes: 2 additions & 1 deletion src/mpi/init/Makefile.mk
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ mpi_core_sources += \
src/mpi/init/init_async.c \
src/mpi/init/init_windows.c \
src/mpi/init/init_bindings.c \
src/mpi/init/init_dbg_logging.c
src/mpi/init/init_dbg_logging.c \
src/mpi/init/init_gpu.c

noinst_HEADERS += src/mpi/init/mpi_init.h
66 changes: 66 additions & 0 deletions src/mpi/init/init_gpu.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Copyright (C) by Argonne National Laboratory
* See COPYRIGHT in top-level directory
*/

#include "mpiimpl.h"
#include "mpi_init.h"

/* Initialize gpu in mpl in order to support shm gpu module initialization
 * inside MPID_Init. This also determines whether GPU support is requested
 * from typerep. */
/* FIXME: we should not be manipulating device-level CVARs from this layer */
int MPII_init_gpu(void)
{
    int mpi_errno = MPI_SUCCESS;

    if (MPIR_CVAR_ENABLE_GPU) {
        /* Emit the MPL GPU debug summary from rank 0 only */
        int debug_summary = 0;
        if (MPIR_CVAR_DEBUG_SUMMARY) {
            debug_summary = (MPIR_Process.rank == 0);
        }

        bool specialized_cache =
            (MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE == MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE_specialized);

        MPL_gpu_info.specialized_cache = specialized_cache;
        MPL_gpu_info.use_immediate_cmdlist = MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST;
        MPL_gpu_info.roundrobin_cmdq = MPIR_CVAR_GPU_ROUND_ROBIN_COMMAND_QUEUES;

        int mpl_errno = MPL_gpu_init(debug_summary);
        MPIR_ERR_CHKANDJUMP(mpl_errno != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**gpu_init");

        int device_count, max_dev_id, max_subdev_id;
        /* Fix: capture the MPL return code in mpl_errno. The previous code
         * assigned it to mpi_errno but tested the stale mpl_errno, so a
         * failure of MPL_gpu_get_dev_count was never detected and mpi_errno
         * was clobbered with a raw MPL code. */
        mpl_errno = MPL_gpu_get_dev_count(&device_count, &max_dev_id, &max_subdev_id);
        MPIR_ERR_CHKANDJUMP(mpl_errno != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**gpu_init");

        if (device_count <= 0) {
            /* No usable GPU device: disable GPU support for the whole stack */
            MPIR_CVAR_ENABLE_GPU = 0;
        } else {
            /* If the MPL backend doesn't support IPC, disable it for the upper layer */
            if (!MPL_gpu_info.enable_ipc) {
                MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD = -1;
            }
            /* If the MPL gpu backend doesn't support specialized cache, fallback to generic. */
            if (specialized_cache && !MPL_gpu_info.specialized_cache) {
                MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE = MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE_generic;
            }
        }
    }

  fn_fail:
    return mpi_errno;
}

/* Tear down the MPL GPU layer if GPU support is (still) enabled.
 * Returns MPI_SUCCESS, or an MPI error if MPL_gpu_finalize fails. */
int MPII_finalize_gpu(void)
{
    int mpi_errno = MPI_SUCCESS;
    int rc = MPL_SUCCESS;

    /* MPII_init_gpu clears MPIR_CVAR_ENABLE_GPU when no device was found,
     * so this is skipped unless MPL_gpu_init actually ran to completion. */
    if (MPIR_CVAR_ENABLE_GPU) {
        rc = MPL_gpu_finalize();
    }
    MPIR_ERR_CHKANDJUMP(rc != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**gpu_finalize");

  fn_fail:
    return mpi_errno;
}
3 changes: 3 additions & 0 deletions src/mpi/init/mpi_init.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ int MPII_finalize_async(void);
void MPII_Call_finalize_callbacks(int min_prio, int max_prio);
void MPII_dump_debug_summary(void);

int MPII_init_gpu(void);
int MPII_finalize_gpu(void);

/* MPI_Init[_thread]/MPI_Finalize only can be used in "world" model where it only
* can be initialized and finalized once, while we can have multiple sessions.
* Following inline functions are used to track the world model state in functions
Expand Down
44 changes: 4 additions & 40 deletions src/mpi/init/mpir_init.c
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ int MPII_Init_thread(int *argc, char ***argv, int user_required, int *provided,
* other and can be initialized in any order. */
/**********************************************************************/

mpi_errno = MPII_init_gpu();
MPIR_ERR_CHECK(mpi_errno);
MPIR_context_id_init();
MPIR_Typerep_init();
MPII_thread_mutex_create();
Expand Down Expand Up @@ -236,42 +238,6 @@ int MPII_Init_thread(int *argc, char ***argv, int user_required, int *provided,
MPIR_ThreadInfo.isThreaded = 0;
#endif

/* Initialize gpu in mpl in order to support shm gpu module initialization
* inside MPID_Init */
if (MPIR_CVAR_ENABLE_GPU) {
int debug_summary = 0;
if (MPIR_CVAR_DEBUG_SUMMARY) {
debug_summary = (MPIR_Process.rank == 0);
}

bool specialized_cache =
(MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE == MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE_specialized);

MPL_gpu_info.specialized_cache = specialized_cache;
MPL_gpu_info.use_immediate_cmdlist = MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST;
MPL_gpu_info.roundrobin_cmdq = MPIR_CVAR_GPU_ROUND_ROBIN_COMMAND_QUEUES;

int mpl_errno = MPL_gpu_init(debug_summary);
MPIR_ERR_CHKANDJUMP(mpl_errno != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**gpu_init");

int device_count, max_dev_id, max_subdev_id;
mpi_errno = MPL_gpu_get_dev_count(&device_count, &max_dev_id, &max_subdev_id);
MPIR_ERR_CHKANDJUMP(mpl_errno != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**gpu_init");

if (device_count <= 0) {
MPIR_CVAR_ENABLE_GPU = 0;
} else {
/* If the MPL backend doesn't support IPC, disable it for the upper layer */
if (!MPL_gpu_info.enable_ipc) {
MPIR_CVAR_CH4_IPC_GPU_P2P_THRESHOLD = -1;
}
/* If the MPL gpu backend doesn't support specialized cache, fallback to generic. */
if (specialized_cache && !MPL_gpu_info.specialized_cache) {
MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE = MPIR_CVAR_CH4_IPC_GPU_HANDLE_CACHE_generic;
}
}
}

mpi_errno = MPID_Init(required, &MPIR_ThreadInfo.thread_provided);
MPIR_ERR_CHECK(mpi_errno);

Expand Down Expand Up @@ -482,10 +448,8 @@ int MPII_Finalize(MPIR_Session * session_ptr)
* for atomic file updates makes this harder. */
MPII_final_coverage_delay(rank);

if (MPIR_CVAR_ENABLE_GPU) {
int mpl_errno = MPL_gpu_finalize();
MPIR_ERR_CHKANDJUMP(mpl_errno != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**gpu_finalize");
}
mpi_errno = MPII_finalize_gpu();
MPIR_ERR_CHECK(mpi_errno);

if (is_world_model) {
mpi_errno = MPIR_nodeid_free();
Expand Down
1 change: 1 addition & 0 deletions src/mpl/configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -1194,6 +1194,7 @@ fi

if test "X${pac_have_hip}" = "Xyes" ; then
AC_DEFINE([HAVE_HIP],[1],[Define if HIP is available])
AC_CHECK_LIB(dl, dlopen, [], AC_MSG_ERROR([dlopen not found. MPL HIP support requires libdl.]))
have_gpu="yes"
GPU_SUPPORT="HIP"
fi
Expand Down
10 changes: 9 additions & 1 deletion src/mpl/src/gpu/mpl_gpu_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,14 @@ int MPL_gpu_init(int debug_summary)
}

cudaError_t ret = cudaGetDeviceCount(&device_count);
CUDA_ERR_CHECK(ret);
if (ret == cudaErrorNoDevice) {
/* call cudaGetLastError() to consume the error */
ret = cudaGetLastError();
assert(ret == cudaErrorNoDevice);
device_count = 0;
} else {
CUDA_ERR_CHECK(ret);
}

if (device_count <= 0) {
gpu_initialized = 1;
Expand Down Expand Up @@ -397,6 +404,7 @@ int MPL_gpu_finalize(void)
free_hook_chain = free_hook_chain->next;
MPL_free(prev);
}
free_hook_chain = NULL;
MPL_initlock_unlock(&free_hook_mutex);

/* Reset initialization state */
Expand Down
15 changes: 15 additions & 0 deletions src/mpl/src/gpu/mpl_gpu_hip.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ static gpu_free_hook_s *free_hook_chain = NULL;
static hipError_t(*sys_hipFree) (void *dptr);

static int gpu_mem_hook_init();
static MPL_initlock_t free_hook_mutex = MPL_INITLOCK_INITIALIZER;

int MPL_gpu_get_dev_count(int *dev_cnt, int *dev_id, int *subdevice_id)
{
Expand Down Expand Up @@ -374,11 +375,14 @@ int MPL_gpu_finalize(void)
MPL_free(global_to_local_map);

gpu_free_hook_s *prev;
MPL_initlock_lock(&free_hook_mutex);
while (free_hook_chain) {
prev = free_hook_chain;
free_hook_chain = free_hook_chain->next;
MPL_free(prev);
}
free_hook_chain = NULL;
MPL_initlock_unlock(&free_hook_mutex);

/* Reset initialization state */
gpu_initialized = 0;
Expand Down Expand Up @@ -455,21 +459,32 @@ int MPL_gpu_free_hook_register(void (*free_hook) (void *dptr))
assert(hook_obj);
hook_obj->free_hook = free_hook;
hook_obj->next = NULL;

MPL_initlock_lock(&free_hook_mutex);
if (!free_hook_chain)
free_hook_chain = hook_obj;
else {
hook_obj->next = free_hook_chain;
free_hook_chain = hook_obj;
}
MPL_initlock_unlock(&free_hook_mutex);

return MPL_SUCCESS;
}

/* Interposed hipFree: run all registered free hooks before releasing the
 * memory through the real (dlsym-resolved) hipFree, all under the hook
 * mutex so the hook chain cannot change mid-call. */
hipError_t hipFree(void *dptr)
{
    hipError_t result;
    MPL_initlock_lock(&free_hook_mutex);

    /* Lazily resolve the real hipFree, e.g. when MPI_Init was skipped */
    if (!sys_hipFree) {
        gpu_mem_hook_init();
    }

    gpu_free_hooks_cb(dptr);
    result = sys_hipFree(dptr);

    /* Fix: release the mutex on exit. The previous code called
     * MPL_initlock_lock here a second time, leaving the mutex held forever
     * and deadlocking the next hipFree or hook registration. */
    MPL_initlock_unlock(&free_hook_mutex);
    return result;
}

Expand Down
21 changes: 16 additions & 5 deletions src/mpl/src/gpu/mpl_gpu_ze.c
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,7 @@ typedef struct gpu_free_hook {
void (*free_hook) (void *dptr);
struct gpu_free_hook *next;
} gpu_free_hook_s;
static MPL_initlock_t free_hook_mutex = MPL_INITLOCK_INITIALIZER;

pid_t mypid;

Expand Down Expand Up @@ -565,7 +566,9 @@ int MPL_gpu_init(int debug_summary)

mypid = getpid();

MPL_initlock_lock(&free_hook_mutex);
gpu_mem_hook_init();
MPL_initlock_unlock(&free_hook_mutex);
gpu_initialized = 1;

if (MPL_gpu_info.debug_summary) {
Expand Down Expand Up @@ -1265,11 +1268,14 @@ int MPL_gpu_finalize(void)
MPL_free(physical_device_states);

gpu_free_hook_s *prev;
MPL_initlock_lock(&free_hook_mutex);
while (free_hook_chain) {
prev = free_hook_chain;
free_hook_chain = free_hook_chain->next;
MPL_free(prev);
}
free_hook_chain = NULL;
MPL_initlock_unlock(&free_hook_mutex);

for (i = 0; i < local_ze_device_count; i++) {
MPL_ze_device_entry_t *device_state = device_states + i;
Expand Down Expand Up @@ -2395,9 +2401,6 @@ static void gpu_free_hooks_cb(void *dptr)

MPL_STATIC_INLINE_PREFIX int gpu_mem_hook_init(void)
{
if (sys_zeMemFree)
return MPL_SUCCESS;

void *libze_handle = dlopen("libze_loader.so", RTLD_LAZY | RTLD_GLOBAL);
assert(libze_handle);

Expand All @@ -2413,12 +2416,15 @@ int MPL_gpu_free_hook_register(void (*free_hook) (void *dptr))
assert(hook_obj);
hook_obj->free_hook = free_hook;
hook_obj->next = NULL;

MPL_initlock_lock(&free_hook_mutex);
if (!free_hook_chain)
free_hook_chain = hook_obj;
else {
hook_obj->next = free_hook_chain;
free_hook_chain = hook_obj;
}
MPL_initlock_unlock(&free_hook_mutex);

return MPL_SUCCESS;
}
Expand Down Expand Up @@ -2464,10 +2470,15 @@ __attribute__ ((visibility("default")))
ze_result_t ZE_APICALL zeMemFree(ze_context_handle_t hContext, void *dptr)
{
    ze_result_t result;
    /* Serialize against hook registration/finalize and hook-table init */
    MPL_initlock_lock(&free_hook_mutex);
    /* Lazily resolve the real zeMemFree — handles the case where MPI_Init
     * was skipped and gpu_mem_hook_init never ran */
    if (!sys_zeMemFree) {
        gpu_mem_hook_init();
    }

    /* Invoke registered free hooks before the memory actually goes away */
    gpu_free_hooks_cb(dptr);
    result = sys_zeMemFree(hContext, dptr);

    MPL_initlock_unlock(&free_hook_mutex);
    return (result);
}

Expand Down

0 comments on commit e693b16

Please sign in to comment.