DPCT seems to generate incomplete codes for a template function #1252

jinz2014 · 2023-08-30T22:21:17Z

Migrating a function from CUDA with DPCT produces incomplete code (e.g. the kernel name is missing from the generated call). Please see the following code snippets taken from the program (https://github.com/zjin-lcf/HeCBench/blob/master/ssim-cuda/utils.h). Could you reproduce the issue? Thanks.

CUDA

  template<typename K, typename T, typename... Types>
    inline void
    trilinear_kernel(K kernel, uint32_t shmem_size, cudaStream_t stream, T width, T height, T depth, Types... args)
    {
      if (width <= 0 || height <= 0 || depth <= 0) {
        return;
      }
      dim3 block_size(n_threads_trilinear, n_threads_trilinear, n_threads_trilinear);
      dim3 grid_size(n_blocks_trilinear(width), n_blocks_trilinear(height), n_blocks_trilinear(depth));
      kernel<<<grid_size, block_size, shmem_size, stream>>>((uint32_t)width, (uint32_t)height, (uint32_t)depth, args...);
    }

DPCT

      stream->parallel_for(
          sycl::nd_range<3>(grid_size * block_size, block_size),
          [=](sycl::nd_item<3> item_ct1) {
                ((uint32_t)width, (uint32_t)height, (uint32_t)depth, args...);
          });

The text was updated successfully, but these errors were encountered:

tomflinda · 2023-09-04T01:48:03Z

@jinz2014 Reproduced; we plan to fix it.
In the meantime, the workaround (WA) is to manually apply the patch below to the migrated code of ssim-cuda.

diff --git a/ssim-cuda/out2/Makefile.dpct b/ssim-cuda/out2/Makefile.dpct
index fa9d1260..4e5bdd38 100644
--- a/ssim-cuda/out2/Makefile.dpct
+++ b/ssim-cuda/out2/Makefile.dpct
@@ -5,7 +5,7 @@ LD := $(CC)
 #DPCT2001:5: You can link with more library by add them here.
 LIB := 
 
-FLAGS := 
+FLAGS := -I./
 
 ifeq ($(shell which $(CC)),)
     $(error ERROR - $(CC) compiler not found)
diff --git a/ssim-cuda/out2/gdt/math/vec/functors.h b/ssim-cuda/out2/gdt/math/vec/functors.h
index c3fb2c7e..acff0fd4 100644
--- a/ssim-cuda/out2/gdt/math/vec/functors.h
+++ b/ssim-cuda/out2/gdt/math/vec/functors.h
@@ -148,9 +148,52 @@ namespace gdt {
                       fct(a.w,b.w));                                    \
   }
 
-                  _define_binary_functor(divRoundUp)
-                      _define_binary_functor(dpct::min(, ))
-                          _define_binary_functor(max)
+
+#define _define_binary_functor_2(fct)                                     \
+  template<typename T>                                                  \
+  __both__ vec_t<T,1> fct(const vec_t<T,1> &a, const vec_t<T,1> &b)     \
+  {                                                                     \
+    return vec_t<T,1>(dpct::fct(a.x,b.x));                                    \
+  }                                                                     \
+                                                                        \
+  template<typename T>                                                  \
+  __both__ vec_t<T,2> fct(const vec_t<T,2> &a, const vec_t<T,2> &b)     \
+  {                                                                     \
+    return vec_t<T,2>(dpct::fct(a.x,b.x),                                     \
+                      dpct::fct(a.y,b.y));                                    \
+  }                                                                     \
+                                                                        \
+  template<typename T>                                                  \
+  __both__ vec_t<T,3> fct(const vec_t<T,3> &a, const vec_t<T,3> &b)     \
+  {                                                                     \
+    return vec_t<T,3>(dpct::fct(a.x,b.x),                                     \
+                      dpct::fct(a.y,b.y),                                     \
+                      dpct::fct(a.z,b.z));                                    \
+  }                                                                     \
+                                                                        \
+  template<typename T1, typename T2>                                    \
+  __both__ vec_t<typename BinaryOpResultType<T1,T2>::type,3>            \
+  fct(const vec_t<T1,3> &a, const vec_t<T2,3> &b)                       \
+  {                                                                     \
+    return vec_t<typename BinaryOpResultType<T1,T2>::type,3>            \
+      (dpct::fct(a.x,b.x),                                                    \
+       dpct::fct(a.y,b.y),                                                    \
+       dpct::fct(a.z,b.z));                                                   \
+  }                                                                     \
+                                                                        \
+  template<typename T>                                                  \
+  __both__ vec_t<T,4> fct(const vec_t<T,4> &a, const vec_t<T,4> &b)     \
+  {                                                                     \
+    return vec_t<T,4>(dpct::fct(a.x,b.x),                                     \
+                      dpct::fct(a.y,b.y),                                     \
+                      dpct::fct(a.z,b.z),                                     \
+                      dpct::fct(a.w,b.w));                                    \
+  }
+
+
+  _define_binary_functor(divRoundUp)
+  _define_binary_functor_2(min)
+  _define_binary_functor(max)
 
 #undef _define_binary_functor
 
diff --git a/ssim-cuda/out2/main.dp.cpp b/ssim-cuda/out2/main.dp.cpp
index db069f8a..32526d1f 100644
--- a/ssim-cuda/out2/main.dp.cpp
+++ b/ssim-cuda/out2/main.dp.cpp
@@ -12,12 +12,15 @@
 
 using vec3i = gdt::vec3i;
 
+
+
+
 template<int WIN_SIZE>
 void 
-compute_ssim(const uint32_t dimx, const uint32_t dimy, const uint32_t dimz,
+compute_ssim(const uint32_t dimx, const uint32_t dimy, const uint32_t dimz, const sycl::nd_item<3> &item_ct1,
              float* __restrict__ _fx, float* __restrict__ _fy, vec3i gdims,              
              float data_range, float cov_norm, float K1, float K2,
-             float* __restrict__ out, const sycl::nd_item<3> &item_ct1)
+             float* __restrict__ out)
 {
   const int32_t x = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
                     item_ct1.get_local_id(2);
@@ -73,6 +76,28 @@ compute_ssim(const uint32_t dimx, const uint32_t dimy, const uint32_t dimz,
   out[x + y * dimx + z * dimx * dimy] = S;
 }
 
+
+template<int WIN_SIZE>
+struct compute_ssim_2 {
+
+void operator()(const uint32_t dimx, const uint32_t dimy, const uint32_t dimz, const sycl::nd_item<3> &item_ct1,
+             float* __restrict__ _fx, float* __restrict__ _fy, vec3i gdims,              
+             float data_range, float cov_norm, float K1, float K2,
+             float* __restrict__ out) const {
+
+
+compute_ssim<WIN_SIZE>(dimx, dimy, dimz, item_ct1,
+             _fx, _fy, gdims,              
+             data_range, cov_norm, K1, K2,
+             out);
+
+}
+
+};
+
+
+
+
 int main() try {
   dpct::device_ext &dev_ct1 = dpct::get_current_device();
   sycl::queue &q_ct1 = dev_ct1.default_queue();
@@ -141,7 +166,7 @@ int main() try {
     auto start = std::chrono::steady_clock::now();
 
     // calculate SSIM between reference and inference
-    util::trilinear_kernel(compute_ssim<win_size>, 0, &q_ct1, block.x, block.y,
+    util::trilinear_kernel(compute_ssim_2<win_size>(), 0, &q_ct1, block.x, block.y,
                            block.z, grid_reference, grid_inference, block_grid,
                            data_range, cov_norm, K1, K2, grid_output);
 
diff --git a/ssim-cuda/out2/utils.h b/ssim-cuda/out2/utils.h
index a442c732..a3fce868 100644
--- a/ssim-cuda/out2/utils.h
+++ b/ssim-cuda/out2/utils.h
@@ -102,7 +102,7 @@ namespace util {
       stream->parallel_for(
           sycl::nd_range<3>(grid_size * block_size, block_size),
           [=](sycl::nd_item<3> item_ct1) {
-                ((uint32_t)width, (uint32_t)height, (uint32_t)depth, args...);
+                kernel((uint32_t)width, (uint32_t)height, (uint32_t)depth, item_ct1, args...);
           });
     }
 
@@ -127,7 +127,7 @@ namespace util {
       stream->parallel_for(
           sycl::nd_range<3>(grid_size * block_size, block_size),
           [=](sycl::nd_item<3> item_ct1) {
-                (dims, args...);
+                kernel(dims, args...);
           });
     }

jinz2014 added the bug Something isn't working label Aug 30, 2023

zhimingwang36 self-assigned this Sep 12, 2023

ShengchenJ self-assigned this Sep 12, 2023

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

DPCT seems to generate incomplete codes for a template function #1252

DPCT seems to generate incomplete codes for a template function #1252

jinz2014 commented Aug 30, 2023

tomflinda commented Sep 4, 2023 •

edited

DPCT seems to generate incomplete codes for a template function #1252

DPCT seems to generate incomplete codes for a template function #1252

Comments

jinz2014 commented Aug 30, 2023

tomflinda commented Sep 4, 2023 • edited

tomflinda commented Sep 4, 2023 •

edited