Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DPCT seems to generate incomplete codes for a template function #1252

Open
jinz2014 opened this issue Aug 30, 2023 · 1 comment
Open

DPCT seems to generate incomplete codes for a template function #1252

jinz2014 opened this issue Aug 30, 2023 · 1 comment
Assignees
Labels
bug Something isn't working

Comments

@jinz2014
Copy link
Contributor

Migrating a function from CUDA with DPCT produces an incomplete result (e.g. the kernel name is missing from the generated call). Please see the following code snippets from the program (https://github.com/zjin-lcf/HeCBench/blob/master/ssim-cuda/utils.h). Could you reproduce the issue? Thanks.

CUDA

  template<typename K, typename T, typename... Types>
    inline void
    trilinear_kernel(K kernel, uint32_t shmem_size, cudaStream_t stream, T width, T height, T depth, Types... args)
    {
      if (width <= 0 || height <= 0 || depth <= 0) {
        return;
      }
      dim3 block_size(n_threads_trilinear, n_threads_trilinear, n_threads_trilinear);
      dim3 grid_size(n_blocks_trilinear(width), n_blocks_trilinear(height), n_blocks_trilinear(depth));
      kernel<<<grid_size, block_size, shmem_size, stream>>>((uint32_t)width, (uint32_t)height, (uint32_t)depth, args...);
    }

DPCT

      stream->parallel_for(
          sycl::nd_range<3>(grid_size * block_size, block_size),
          [=](sycl::nd_item<3> item_ct1) {
                ((uint32_t)width, (uint32_t)height, (uint32_t)depth, args...);
          });
@jinz2014 jinz2014 added the bug Something isn't working label Aug 30, 2023
@tomflinda
Copy link
Contributor

tomflinda commented Sep 4, 2023

@jinz2014 Reproduced; we plan to fix it.
In the meantime, the workaround is to manually apply the patch below to the migrated code of ssim-cuda.

diff --git a/ssim-cuda/out2/Makefile.dpct b/ssim-cuda/out2/Makefile.dpct
index fa9d1260..4e5bdd38 100644
--- a/ssim-cuda/out2/Makefile.dpct
+++ b/ssim-cuda/out2/Makefile.dpct
@@ -5,7 +5,7 @@ LD := $(CC)
 #DPCT2001:5: You can link with more library by add them here.
 LIB := 
 
-FLAGS := 
+FLAGS := -I./
 
 ifeq ($(shell which $(CC)),)
     $(error ERROR - $(CC) compiler not found)
diff --git a/ssim-cuda/out2/gdt/math/vec/functors.h b/ssim-cuda/out2/gdt/math/vec/functors.h
index c3fb2c7e..acff0fd4 100644
--- a/ssim-cuda/out2/gdt/math/vec/functors.h
+++ b/ssim-cuda/out2/gdt/math/vec/functors.h
@@ -148,9 +148,52 @@ namespace gdt {
                       fct(a.w,b.w));                                    \
   }
 
-                  _define_binary_functor(divRoundUp)
-                      _define_binary_functor(dpct::min(, ))
-                          _define_binary_functor(max)
+
+#define _define_binary_functor_2(fct)                                     \
+  template<typename T>                                                  \
+  __both__ vec_t<T,1> fct(const vec_t<T,1> &a, const vec_t<T,1> &b)     \
+  {                                                                     \
+    return vec_t<T,1>(dpct::fct(a.x,b.x));                                    \
+  }                                                                     \
+                                                                        \
+  template<typename T>                                                  \
+  __both__ vec_t<T,2> fct(const vec_t<T,2> &a, const vec_t<T,2> &b)     \
+  {                                                                     \
+    return vec_t<T,2>(dpct::fct(a.x,b.x),                                     \
+                      dpct::fct(a.y,b.y));                                    \
+  }                                                                     \
+                                                                        \
+  template<typename T>                                                  \
+  __both__ vec_t<T,3> fct(const vec_t<T,3> &a, const vec_t<T,3> &b)     \
+  {                                                                     \
+    return vec_t<T,3>(dpct::fct(a.x,b.x),                                     \
+                      dpct::fct(a.y,b.y),                                     \
+                      dpct::fct(a.z,b.z));                                    \
+  }                                                                     \
+                                                                        \
+  template<typename T1, typename T2>                                    \
+  __both__ vec_t<typename BinaryOpResultType<T1,T2>::type,3>            \
+  fct(const vec_t<T1,3> &a, const vec_t<T2,3> &b)                       \
+  {                                                                     \
+    return vec_t<typename BinaryOpResultType<T1,T2>::type,3>            \
+      (dpct::fct(a.x,b.x),                                                    \
+       dpct::fct(a.y,b.y),                                                    \
+       dpct::fct(a.z,b.z));                                                   \
+  }                                                                     \
+                                                                        \
+  template<typename T>                                                  \
+  __both__ vec_t<T,4> fct(const vec_t<T,4> &a, const vec_t<T,4> &b)     \
+  {                                                                     \
+    return vec_t<T,4>(dpct::fct(a.x,b.x),                                     \
+                      dpct::fct(a.y,b.y),                                     \
+                      dpct::fct(a.z,b.z),                                     \
+                      dpct::fct(a.w,b.w));                                    \
+  }
+
+
+  _define_binary_functor(divRoundUp)
+  _define_binary_functor_2(min)
+  _define_binary_functor(max)
 
 #undef _define_binary_functor
 
diff --git a/ssim-cuda/out2/main.dp.cpp b/ssim-cuda/out2/main.dp.cpp
index db069f8a..32526d1f 100644
--- a/ssim-cuda/out2/main.dp.cpp
+++ b/ssim-cuda/out2/main.dp.cpp
@@ -12,12 +12,15 @@
 
 using vec3i = gdt::vec3i;
 
+
+
+
 template<int WIN_SIZE>
 void 
-compute_ssim(const uint32_t dimx, const uint32_t dimy, const uint32_t dimz,
+compute_ssim(const uint32_t dimx, const uint32_t dimy, const uint32_t dimz, const sycl::nd_item<3> &item_ct1,
              float* __restrict__ _fx, float* __restrict__ _fy, vec3i gdims,              
              float data_range, float cov_norm, float K1, float K2,
-             float* __restrict__ out, const sycl::nd_item<3> &item_ct1)
+             float* __restrict__ out)
 {
   const int32_t x = item_ct1.get_group(2) * item_ct1.get_local_range(2) +
                     item_ct1.get_local_id(2);
@@ -73,6 +76,28 @@ compute_ssim(const uint32_t dimx, const uint32_t dimy, const uint32_t dimz,
   out[x + y * dimx + z * dimx * dimy] = S;
 }
 
+
+template<int WIN_SIZE>
+struct compute_ssim_2 {
+
+void operator()(const uint32_t dimx, const uint32_t dimy, const uint32_t dimz, const sycl::nd_item<3> &item_ct1,
+             float* __restrict__ _fx, float* __restrict__ _fy, vec3i gdims,              
+             float data_range, float cov_norm, float K1, float K2,
+             float* __restrict__ out) const {
+
+
+compute_ssim<WIN_SIZE>(dimx, dimy, dimz, item_ct1,
+             _fx, _fy, gdims,              
+             data_range, cov_norm, K1, K2,
+             out);
+
+}
+
+};
+
+
+
+
 int main() try {
   dpct::device_ext &dev_ct1 = dpct::get_current_device();
   sycl::queue &q_ct1 = dev_ct1.default_queue();
@@ -141,7 +166,7 @@ int main() try {
     auto start = std::chrono::steady_clock::now();
 
     // calculate SSIM between reference and inference
-    util::trilinear_kernel(compute_ssim<win_size>, 0, &q_ct1, block.x, block.y,
+    util::trilinear_kernel(compute_ssim_2<win_size>(), 0, &q_ct1, block.x, block.y,
                            block.z, grid_reference, grid_inference, block_grid,
                            data_range, cov_norm, K1, K2, grid_output);
 
diff --git a/ssim-cuda/out2/utils.h b/ssim-cuda/out2/utils.h
index a442c732..a3fce868 100644
--- a/ssim-cuda/out2/utils.h
+++ b/ssim-cuda/out2/utils.h
@@ -102,7 +102,7 @@ namespace util {
       stream->parallel_for(
           sycl::nd_range<3>(grid_size * block_size, block_size),
           [=](sycl::nd_item<3> item_ct1) {
-                ((uint32_t)width, (uint32_t)height, (uint32_t)depth, args...);
+                kernel((uint32_t)width, (uint32_t)height, (uint32_t)depth, item_ct1, args...);
           });
     }
 
@@ -127,7 +127,7 @@ namespace util {
       stream->parallel_for(
           sycl::nd_range<3>(grid_size * block_size, block_size),
           [=](sycl::nd_item<3> item_ct1) {
-                (dims, args...);
+                kernel(dims, args...);
           });
     }
 

@zhimingwang36 zhimingwang36 self-assigned this Sep 12, 2023
@ShengchenJ ShengchenJ self-assigned this Sep 12, 2023
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working
Projects
None yet
Development

No branches or pull requests

4 participants