CUDA: volume renderer gradient kernel slow #1083

erizmr · 2023-03-29T16:08:54Z

erizmr
Mar 29, 2023

Hi, I am trying to use EnzymeAD to differentiate a volume renderer. I installed Enzyme according to https://github.com/wsmoses/Enzyme-GPU-Tests and managed to get the program to work by following https://enzyme.mit.edu/getting_started/CUDAGuide/.

I profiled the program and found that the forward kernel takes 5ms whereas the backward kernel takes 950ms, which is unexpectedly slow. I am not sure whether I missed something. The CUDA code of the forward and backward kernels is listed below. It would be much appreciated if someone could help check whether I am using Enzyme correctly here. Thanks!

Forward kernel:

///////////// Enzyme /////////////
// Device-side marker globals that are passed as the `int` arguments of
// __enzyme_autodiff. In the call shown further down, enzyme_dup precedes each
// (primal pointer, gradient pointer) pair and enzyme_const precedes the
// trailing M, N, rays arguments. enzyme_out is declared but not used by the
// code shown here.
int __device__ enzyme_dup;
int __device__ enzyme_out;
int __device__ enzyme_const;

// Accumulates the samples of one ray into r/g/b, depth and summed weight.
//
// Each thread handles ray n = threadIdx.x + blockIdx.x * blockDim.x and
// returns immediately when n >= N.
//
// Inputs, as indexed by the code below:
//   sigmas - one float per sample
//   rgbs   - three floats per sample (accumulated into r, g, b)
//   deltas - two floats per sample: [0] enters the alpha term, [1] is summed
//            into t, which feeds the depth output
//   rays   - three ints per ray: output slot `index`, first-sample `offset`,
//            and sample count `num_steps`
//   M      - bound checked against offset + num_steps
//   N      - number of rays
// Outputs, written at the ray's `index`:
//   weights_sum[index], depth[index], image[index * 3 .. index * 3 + 2]
// `weights` is accepted but neither read nor written in this function.
__device__ void volume_renderer(
    const float * __restrict__ sigmas, 
    const float * __restrict__ rgbs,
    const float * __restrict__ deltas,
    float * weights,
    float * weights_sum,
    float * depth,
    float * image, 
    const uint32_t M, const uint32_t N, const int * __restrict__ rays){

/// parallel per ray
    const uint32_t n = threadIdx.x + blockIdx.x * blockDim.x;
    if (n >= N) return;

    // Unpack this ray's record; the int values from `rays` are converted to
    // uint32_t here.
    uint32_t index = rays[n * 3];
    uint32_t offset = rays[n * 3 + 1];
    uint32_t num_steps = rays[n * 3 + 2];

    // Empty ray, or a ray whose samples would run past M: write zeros to all
    // outputs for this ray and stop.
    // NOTE(review): offset + num_steps is evaluated in uint32_t, so it wraps
    // on overflow instead of failing this check.
    if (num_steps == 0 || offset + num_steps > M) {
        weights_sum[index] = 0;
        depth[index] = 0;
        image[index * 3] = 0;
        image[index * 3 + 1] = 0;
        image[index * 3 + 2] = 0;
        return;
    }

    // Move the input pointers to this ray's first sample (strides 1, 3, 2).
    sigmas += offset;
    rgbs += offset * 3;
    deltas += offset * 2;

    // accumulate 
    uint32_t step = 0;

    // T is multiplied by (1 - alpha) after every sample; the loop stops early
    // once it drops below T_thresh.
    float T = 1.0f;
    float T_thresh = 0.0001f;
    float r = 0, g = 0, b = 0, ws = 0, t = 0, d = 0;

    while (step < num_steps) {

        // __expf is the fast-math exponential intrinsic.
        const float alpha = 1.0f - __expf(- sigmas[0] * deltas[0]);
        const float weight = alpha * T;

        r += weight * rgbs[0];
        g += weight * rgbs[1];
        b += weight * rgbs[2];
        
        t += deltas[1]; // real delta
        d += weight * t;
        
        ws += weight;
        
        T *= 1.0f - alpha;

        // Stop once the remaining transmittance is below the threshold. The
        // current sample has already been accumulated at this point.
        if (T < T_thresh) break;

        //printf("[n=%d] num_steps=%d, alpha=%f, w=%f, T=%f, sum_dt=%f, d=%f\n", n, step, alpha, weight, T, sum_delta, d);

        // Advance all three input pointers to the next sample.
        // NOTE(review): the Enzyme remarks reported for this kernel show the
        // loads through these advancing pointers being cached; indexing by
        // `step` instead was reported to remove most of that caching.
        sigmas++;
        rgbs += 3;
        deltas += 2;

        step++;
    }

    //printf("[n=%d] rgb=(%f, %f, %f), d=%f\n", n, r, g, b, d);

    // Write the accumulated results for this ray.
    weights_sum[index] = ws; // weights_sum
    depth[index] = d;
    image[index * 3] = r;
    image[index * 3 + 1] = g;
    image[index * 3 + 2] = b;
}


// Kernel entry point for the forward pass: forwards every argument unchanged
// to the __device__ function volume_renderer, which does its own per-thread
// ray selection and n >= N bounds check.
void __global__  volume_renderer_wrapper(
    const float * __restrict__ sigmas, 
    const float * __restrict__ rgbs,
    const float * __restrict__ deltas,
    float * weights,
    float * weights_sum,
    float * depth,
    float * image, 
    const uint32_t M, const uint32_t N, const int * __restrict__ rays){
    volume_renderer(sigmas, rgbs, deltas, weights, weights_sum, depth, image, M, N, rays);
}

Backward kernel:

// Declaration of the Enzyme entry point as used for volume_renderer: the
// function pointer, then seven (int marker, const float* primal, float* gradient)
// triples, then one int marker followed by M, N and rays.
extern void  __device__ __enzyme_autodiff(void *, int, const float*, float*, int, const float*, float*, int, const float*, float*, int, const float*, float*, int, const float*, float*, int, const float*, float*, int, const float*, float*, int, const uint32_t, const uint32_t, const int * __restrict__);




// Kernel entry point for the backward pass: calls __enzyme_autodiff on
// volume_renderer. Every float buffer is passed together with its grad_*
// counterpart under enzyme_dup; M, N and rays follow enzyme_const.
// Presumably grad_weights_sum / grad_depth / grad_image carry the incoming
// output gradients and grad_sigmas / grad_rgbs / grad_deltas receive the
// input gradients — this is not visible from the code here; confirm against
// the Enzyme documentation.
void __global__ grad_volume_renderer_wrapper(const float * __restrict__ sigmas,
                                  float * grad_sigmas,
                                  const float * __restrict__ rgbs,
                                  float * grad_rgbs,
                                  const float * __restrict__ deltas,
                                  float * grad_deltas,
                                  float * weights,
                                  float * grad_weights,
                                  float * weights_sum,
                                  float * grad_weights_sum,
                                  float * depth,
                                  float * grad_depth,
                                  float * image, 
                                  float * grad_image,
                                  const uint32_t M,
                                  const uint32_t N,
                                  const int * __restrict__ rays
                                  ){

    // NOTE(review): a single enzyme_const marker is given before M; N and
    // rays follow without a marker of their own. Confirm that this matches
    // Enzyme's calling convention for these three arguments.
    __enzyme_autodiff((void*)volume_renderer, 
                             enzyme_dup, sigmas, grad_sigmas, 
                             enzyme_dup, rgbs, grad_rgbs,
                             enzyme_dup, deltas, grad_deltas,
                             enzyme_dup, weights, grad_weights,
                             enzyme_dup, weights_sum, grad_weights_sum,
                             enzyme_dup, depth, grad_depth,
                             enzyme_dup, image, grad_image,
                             enzyme_const, M, N, rays);
}

Answered by wsmoses

Mar 31, 2023

You can add -mllvm -enzyme-max-cache which changes the allocations from dynamic reallocs to static malloc, which should partially improve things -- though those two remaining allocations still persist and can cause some performance issues.

@michel2323 @sriharikrishna this looks like a good example of why we should move some more of the checkpointing work into Enzyme proper. cc @vchuravy

View full answer

ZuseZ4 · 2023-03-29T16:45:43Z

ZuseZ4
Mar 29, 2023
Collaborator

Haven't used the cuda side myself, but I guess that is expected based on your compilation command.
It is also mentioned somewhere down on the page:

'''
Note that this procedure (using ClangEnzyme as opposed to LLVMEnzyme manually) may not properly nest Enzyme between optimization passes and may impact performance in unintended ways.
'''

The reason is that we disable some optimizations before running Enzyme and, on top of that, emit mostly unoptimized code that is intended to be easily optimizable. However, the second optimization run is missing here.

I am currently on my phone so I can't build you the full command, but maybe have a look at the pure C++ example with lld and LLDEnzyme or opt and LLVMEnzyme and try to add the corresponding cuda flags there along the way.

0 replies

erizmr · 2023-03-29T17:15:56Z

erizmr
Mar 29, 2023
Author

Hi @ZuseZ4, thanks for the timely reply! As for the compilation command, I mainly followed the Makefile in this example: https://github.com/wsmoses/Enzyme-GPU-Tests/blob/main/LBM/Makefile . It would be great if you could share a sample command when it is convenient for you. Thanks.

0 replies

tgymnich · 2023-03-29T17:25:37Z

tgymnich
Mar 29, 2023
Collaborator

Again unrelated to CUDA. Do you need all the primals for weights, weights_sum, depth, image? If that's not the case, you could selectively disable some of the primals by using enzyme_dupnoneed instead of enzyme_dup.

0 replies

wsmoses · 2023-03-29T18:01:49Z

wsmoses
Mar 29, 2023
Maintainer

Can you post a zip file containing the code/makefile/etc?

The Makefile you link to has many different options to be able to run an ablation analysis to test how effective optimizations are.

0 replies

erizmr · 2023-03-31T09:26:46Z

erizmr
Mar 31, 2023
Author

Hi @wsmoses , here is the zip containing code, makefile, test data and how to repro. It will be great if you can help take a look. Thanks!
volume_renderer_repro.zip

0 replies

erizmr · 2023-03-31T09:32:00Z

erizmr
Mar 31, 2023
Author

Thanks for the suggestions @tgymnich. I am a beginner with EnzymeAD and not familiar with its conventions, so I just set everything to enzyme_dup to make sure it works. Though most of the variables require gradients, I can try to make some optimizations following your suggestions.

0 replies

wsmoses · 2023-03-31T13:33:21Z

wsmoses
Mar 31, 2023
Maintainer

If you add -Rpass=enzyme you get performance warnings, in this case the following:

src/raymarching.cu:67:14: remark: SE could not compute loop limit of while.cond of diffe_Z15volume_rendererPKfS0_S0_PfS1_S1_S1_jjPKilim: ***COULDNOTCOMPUTE*** maxlim: ***COULDNOTCOMPUTE*** [-Rpass=enzyme]
    uint32_t step = 0;
             ^
src/raymarching.cu:75:45: remark: Load may need caching   %6 = load float, float* %sigmas.addr.0, align 4, !dbg !67, !tbaa !44 due to   store float %ws.1, float* %arrayidx67, align 4, !dbg !102, !tbaa !44 [-Rpass=enzyme]
        const float alpha = 1.0f - __expf(- sigmas[0] * deltas[0]);
                                            ^
src/raymarching.cu:75:57: remark: Load may need caching   %7 = load float, float* %deltas.addr.0, align 4, !dbg !69, !tbaa !44 due to   store float %ws.1, float* %arrayidx67, align 4, !dbg !102, !tbaa !44 [-Rpass=enzyme]
        const float alpha = 1.0f - __expf(- sigmas[0] * deltas[0]);
                                                        ^
src/raymarching.cu:78:23: remark: Load may need caching   %10 = load float, float* %rgbs.addr.0, align 4, !dbg !77, !tbaa !44 due to   store float %ws.1, float* %arrayidx67, align 4, !dbg !102, !tbaa !44 [-Rpass=enzyme]
        r += weight * rgbs[0];
                      ^
src/raymarching.cu:79:23: remark: Load may need caching   %11 = load float, float* %arrayidx47, align 4, !dbg !80, !tbaa !44 due to   store float %ws.1, float* %arrayidx67, align 4, !dbg !102, !tbaa !44 [-Rpass=enzyme]
        g += weight * rgbs[1];
                      ^
src/raymarching.cu:80:23: remark: Load may need caching   %12 = load float, float* %arrayidx50, align 4, !dbg !83, !tbaa !44 due to   store float %ws.1, float* %arrayidx67, align 4, !dbg !102, !tbaa !44 [-Rpass=enzyme]
        b += weight * rgbs[2];
                      ^
src/raymarching.cu:82:14: remark: Load may need caching   %13 = load float, float* %arrayidx53, align 4, !dbg !86, !tbaa !44 due to   store float %ws.1, float* %arrayidx67, align 4, !dbg !102, !tbaa !44 [-Rpass=enzyme]
        t += deltas[1]; // real delta
             ^
note: could not determine the original source location for src/raymarching.cu:0:0
remark: Caching instruction   %T.0 = phi float [ 1.000000e+00, %if.end31 ], [ %mul59, %cleanup ], !dbg !91 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
note: could not determine the original source location for src/raymarching.cu:0:0
src/raymarching.cu:75:57: remark: Load must be recomputed   %7 = load float, float* %deltas.addr.0, align 4, !dbg !69, !tbaa !44 in reverse_invertwhile.body due to   store float %ws.1, float* %arrayidx67, align 4, !dbg !102, !tbaa !44 [-Rpass=enzyme]
        const float alpha = 1.0f - __expf(- sigmas[0] * deltas[0]);
                                                        ^
src/raymarching.cu:75:57: remark: Caching instruction   %12 = load float, float* %deltas.addr.0, align 4, !dbg !98, !tbaa !45 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
src/raymarching.cu:75:45: remark: Load must be recomputed   %6 = load float, float* %sigmas.addr.0, align 4, !dbg !67, !tbaa !44 in reverse_invertwhile.body due to   store float %ws.1, float* %arrayidx67, align 4, !dbg !102, !tbaa !44 [-Rpass=enzyme]
        const float alpha = 1.0f - __expf(- sigmas[0] * deltas[0]);
                                            ^
src/raymarching.cu:75:45: remark: Caching instruction   %14 = load float, float* %sigmas.addr.0, align 4, !dbg !96, !tbaa !45 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
src/raymarching.cu:82:11: remark: Caching instruction   %add54 = fadd fast float %28, %t.0, !dbg !120 legalRecompute: 1 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
        t += deltas[1]; // real delta
          ^
src/raymarching.cu:64:12: remark: Caching instruction   %6 = phi float* [ %"add.ptr37'ipg", %if.end31 ], [ %35, %cleanup ], !dbg !88 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
    deltas += offset * 2;
           ^
src/raymarching.cu:80:23: remark: Load must be recomputed   %12 = load float, float* %arrayidx50, align 4, !dbg !83, !tbaa !44 in reverse_invertwhile.body due to   store float %ws.1, float* %arrayidx67, align 4, !dbg !102, !tbaa !44 [-Rpass=enzyme]
        b += weight * rgbs[2];
                      ^
src/raymarching.cu:80:23: remark: Caching instruction   %36 = load float, float* %arrayidx50, align 4, !dbg !118, !tbaa !45 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
src/raymarching.cu:63:10: remark: Caching instruction   %7 = phi float* [ %"add.ptr34'ipg", %if.end31 ], [ %47, %cleanup ], !dbg !86 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
    rgbs += offset * 3;
         ^
src/raymarching.cu:79:23: remark: Load must be recomputed   %11 = load float, float* %arrayidx47, align 4, !dbg !80, !tbaa !44 in reverse_invertwhile.body due to   store float %ws.1, float* %arrayidx67, align 4, !dbg !102, !tbaa !44 [-Rpass=enzyme]
        g += weight * rgbs[1];
                      ^
src/raymarching.cu:79:23: remark: Caching instruction   %44 = load float, float* %arrayidx47, align 4, !dbg !117, !tbaa !45 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
src/raymarching.cu:78:23: remark: Load must be recomputed   %10 = load float, float* %rgbs.addr.0, align 4, !dbg !77, !tbaa !44 in reverse_invertwhile.body due to   store float %ws.1, float* %arrayidx67, align 4, !dbg !102, !tbaa !44 [-Rpass=enzyme]
        r += weight * rgbs[0];
                      ^
src/raymarching.cu:78:23: remark: Caching instruction   %46 = load float, float* %rgbs.addr.0, align 4, !dbg !114, !tbaa !45 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
src/raymarching.cu:62:12: remark: Caching instruction   %8 = phi float* [ %"add.ptr'ipg", %if.end31 ], [ %64, %cleanup ], !dbg !84 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
    sigmas += offset;
           ^
src/raymarching.cu:75:57: remark: Load must be recomputed   %7 = load float, float* %deltas.addr.0, align 4, !dbg !69, !tbaa !44 in reverse_invertcleanup due to   store float %ws.1, float* %arrayidx67, align 4, !dbg !102, !tbaa !44 [-Rpass=enzyme]
        const float alpha = 1.0f - __expf(- sigmas[0] * deltas[0]);
                                                        ^
src/raymarching.cu:75:45: remark: Load must be recomputed   %6 = load float, float* %sigmas.addr.0, align 4, !dbg !67, !tbaa !44 in reverse_invertcleanup due to   store float %ws.1, float* %arrayidx67, align 4, !dbg !102, !tbaa !44 [-Rpass=enzyme]
        const float alpha = 1.0f - __expf(- sigmas[0] * deltas[0]);
                                            ^
note: could not determine the original source location for src/raymarching.cu:0:0
remark: Caching instruction   %"cmp38!manual_lcssa" = phi i1 [ %cmp38, %cleanup ], [ %cmp38, %while.cond ], !dbg !91 legalRecompute: 1 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
note: could not determine the original source location for src/raymarching.cu:0:0

0 replies

wsmoses · 2023-03-31T13:36:47Z

wsmoses
Mar 31, 2023
Maintainer

From these perf warnings, it looks like LLVM optimizations aren't happy about your indexing, changing it to something like the following improves it:

    // Same accumulation loop as the original kernel, but every load is indexed
    // by `step` (strides 1, 3 and 2) and the input pointers are no longer
    // advanced inside the loop.
    while (step < num_steps) {

        const float alpha = 1.0f - __expf(- sigmas[step] * deltas[2*step]);
        const float weight = alpha * T;

        r += weight * rgbs[3*step+0];
        g += weight * rgbs[3*step+1];
        b += weight * rgbs[3*step+2];
        
        t += deltas[2*step+1]; // real delta
        d += weight * t;
        
        ws += weight;
        
        T *= 1.0f - alpha;

        // Stop once the remaining transmittance is below the threshold.
        if (T < T_thresh) break;

        //printf("[n=%d] num_steps=%d, alpha=%f, w=%f, T=%f, sum_dt=%f, d=%f\n", n, step, alpha, weight, T, sum_delta, d);

        // Pointer advancing is disabled; `step` is the only loop-carried index.
        // sigmas++;
        // rgbs += 3;
        // deltas += 2;

        step++;
    }

The performance warnings now become the following, which are far fewer, but still require caching for T and the related variables:

src/raymarching.cu:67:14: remark: SE could not compute loop limit of while.cond of diffe_Z15volume_rendererPKfS0_S0_PfS1_S1_S1_jjPKilim: ***COULDNOTCOMPUTE*** maxlim: %5 [-Rpass=enzyme]
    uint32_t step = 0;
             ^
note: could not determine the original source location for src/raymarching.cu:0:0
remark: Caching instruction   %T.0 = phi float [ 1.000000e+00, %if.end31 ], [ %mul71, %while.body ], !dbg !91 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
note: could not determine the original source location for src/raymarching.cu:0:0
src/raymarching.cu:82:11: remark: Caching instruction   %add66 = fadd fast float %19, %t.0, !dbg !123 legalRecompute: 1 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
        t += deltas[1]; // real delta
          ^
note: could not determine the original source location for src/raymarching.cu:0:0
remark: Caching instruction   %"!manual_lcssa" = phi i32 [ %14, %while.body ], [ %14, %while.cond ], !dbg !92 legalRecompute: 0 shouldRecompute: 0 tryLegalRecomputeCheck: 1 [-Rpass=enzyme]
note: could not determine the original source location for src/raymarching.cu:0:0

0 replies

wsmoses · 2023-03-31T13:39:47Z

wsmoses
Mar 31, 2023
Maintainer

You can add -mllvm -enzyme-max-cache which changes the allocations from dynamic reallocs to static malloc, which should partially improve things -- though those two remaining allocations still persist and can cause some performance issues.

@michel2323 @sriharikrishna this looks like a good example of why we should move some more of the checkpointing work into Enzyme proper. cc @vchuravy

0 replies

erizmr · 2023-04-01T09:50:39Z

erizmr
Apr 1, 2023
Author

Hi @wsmoses, thanks for your help! After I modified the source code and compilation flags according to your suggestions, the time cost of the backward kernel was reduced to ~6 ms, which is much more sensible to me. There are still two points which confuse me.

As the performance gain is huge, I am wondering what the key reason for the improvement is. Is it the removal of some local caches for sigmas, rgbs and deltas?
Another observation is that when I use a larger batch size such as 8192, the backward kernel crashes with what(): CUDA error: an illegal memory access was encountered, unless I change the indexing in the source code and add the flag -mllvm -enzyme-max-cache as you suggested. I am wondering what the potential causes are.

Thanks!

2 replies

erizmr Apr 6, 2023
Author

Hi @wsmoses , thanks for your help! After I modified the source code and compilation flags according to your suggestions, the time cost of the backward kernel reduce to ~6 ms, which is much more sensible to me. There are still two points which confused me.

As the performance gain is huge, I am wondering what the key reason for the improvement is. Is it the removal of some local caches for sigmas, rgbs and deltas?

Another observation is that when I use a larger batch size such as 8192, the backward kernel crashed with what(): CUDA error: an illegal memory access was encountered if I didn't change the indexing of the source code and add the flag -mllvm -enzyme-max-cache as you suggested. I am wondering what are the potential causes?

Thanks!

Just wondering if anyone can share further insights about the questions here please? Thanks a lot!

tgymnich Apr 6, 2023
Collaborator

As the performance gain is huge, I am wondering what the key reason for the improvement is. Is it the removal of some local caches for sigmas, rgbs and deltas?

I would assume the speedup is due to fewer calls to malloc.

Another observation is that when I use a larger batch size such as 8192, the backward kernel crashed with what(): CUDA error: an illegal memory access was encountered if I didn't change the indexing of the source code and add the flag -mllvm -enzyme-max-cache as you suggested. I am wondering what are the potential causes?

Is it possible that the cache becomes larger than your GPU memory?
Try running a tool like the compute-sanitizer to try to debug this.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

CUDA: volume renderer gradient kernel slow #1083

{{title}}

{{editor}}'s edit

{{editor}}'s edit

Replies: 10 comments 2 replies

{{title}}

{{editor}}'s edit

{{editor}}'s edit

{{title}}

{{title}}

{{title}}

{{title}}

{{editor}}'s edit

{{editor}}'s edit

{{title}}

{{title}}

{{title}}

{{title}}

{{title}}

{{title}}

{{title}}

Select a reply

CUDA: volume renderer gradient kernel slow #1083

erizmr Mar 29, 2023

Replies: 10 comments · 2 replies

ZuseZ4 Mar 29, 2023 Collaborator

erizmr Mar 29, 2023 Author

tgymnich Mar 29, 2023 Collaborator

wsmoses Mar 29, 2023 Maintainer

erizmr Mar 31, 2023 Author

erizmr Mar 31, 2023 Author

wsmoses Mar 31, 2023 Maintainer

wsmoses Mar 31, 2023 Maintainer

wsmoses Mar 31, 2023 Maintainer

erizmr Apr 1, 2023 Author

erizmr Apr 6, 2023 Author

tgymnich Apr 6, 2023 Collaborator

erizmr
Mar 29, 2023

Replies: 10 comments 2 replies

ZuseZ4
Mar 29, 2023
Collaborator

erizmr
Mar 29, 2023
Author

tgymnich
Mar 29, 2023
Collaborator

wsmoses
Mar 29, 2023
Maintainer

erizmr
Mar 31, 2023
Author

erizmr
Mar 31, 2023
Author

wsmoses
Mar 31, 2023
Maintainer

wsmoses
Mar 31, 2023
Maintainer

wsmoses
Mar 31, 2023
Maintainer

erizmr
Apr 1, 2023
Author

erizmr Apr 6, 2023
Author

tgymnich Apr 6, 2023
Collaborator