LLNL · davideberius · Jun 5, 2019 · Jun 5, 2019 · Jun 5, 2019 · Jun 6, 2019
diff --git a/include/RAJA/policy/cuda/kernel/CudaKernel.hpp b/include/RAJA/policy/cuda/kernel/CudaKernel.hpp
@@ -301,8 +301,13 @@ struct CudaLaunchHelper<cuda_launch<async0, num_blocks, num_threads>,StmtList,Da
     }
   }
 
+  //
+  // actual_threads can be modified in the case where
+  // the requested number of theads would cause the
+  // max_blocks to be 0
+  //
   inline static void max_blocks(int shmem_size,
-      int &max_blocks, int actual_threads)
+      int &max_blocks, int &actual_threads)
   {
     auto func = internal::CudaKernelLauncher<Data, executor_t>;
 
@@ -314,6 +319,23 @@ struct CudaLaunchHelper<cuda_launch<async0, num_blocks, num_threads>,StmtList,Da
       internal::cuda_occupancy_max_blocks<Self>(
           func, shmem_size, max_blocks, actual_threads);
 
+      if(max_blocks == 0) {
+
+        //
+        // actual_threads is too high for the kernel to run
+        // successfully due to resource constraints reset to
+        // something that is possible
+        //
+        int min_grid_size, max_block_size;
+        // NOTE: min_grid_size is the minimum grid size needed
+        //       to achieve the best potential occupancy
+        cudaErrchk(cudaOccupancyMaxPotentialBlockSize(
+            &min_grid_size, &max_block_size, func, shmem_size));
+        actual_threads = max_block_size;
+        max_blocks = min_grid_size;
+      }
+
+
     } else {
 
       //
@@ -477,7 +499,17 @@ struct StatementExecutor<
       // Compute the MAX physical kernel blocks
       //
       int max_blocks;
-      launch_t::max_blocks(shmem, max_blocks, launch_dims.num_threads());
+      int adjusted_threads = launch_dims.num_threads();
+      launch_t::max_blocks(shmem, max_blocks, adjusted_threads);
+
+      //
+      // Redo fit with adjusted_threads
+      //
+      if(launch_dims.num_threads() != adjusted_threads) {
+        fit_threads = fitCudaDims(
+            adjusted_threads, launch_dims.threads, launch_dims.min_threads);
+        launch_dims.threads = fit_threads;
+      }
 
       int use_blocks;