Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

OpenCL backend produces Syntax Error #247

Open
fkellner opened this issue Apr 27, 2023 · 5 comments
Open

OpenCL backend produces Syntax Error #247

fkellner opened this issue Apr 27, 2023 · 5 comments
Labels
bug Something isn't working

Comments

@fkellner
Copy link

Describe the bug

Running a task with @Parallel-annotated for-loops calculating a simple low-pass filter produces an OpenCL syntax error when run on Apple M1 or Windows.

The offending method reads as follows:

public static void calcLowpassAtRB(float[] samples, float[] lowPassAtRB, int width, int height, int green1Idx, int green2Idx) {
        for(@Parallel int x = 0; x < width; x++) {
            for(@Parallel int y = 0; y < height; y++) {
                int patternIdx = (x % 2) + 2 * (y % 2);
                if (patternIdx == green1Idx || patternIdx == green2Idx) {
                    lowPassAtRB[x + y * width] = 0.0f;
                } else {
                    // 3x3 low-pass filter

                    float middle = samples[x + y * width];

                    float top =         samples[x + max(0, y - 1) * width];
                    float bottom =      samples[x + min(height - 1, y + 1) * width];
                    float left =        samples[max(0, x - 1) + y * width];
                    float right =       samples[min(width - 1, x + 1) + y * width];

                    float topLeft =     samples[max(0, x - 1) + max(0, y - 1) * width];
                    float topRight =    samples[min(width - 1, x + 1) + max(0, y - 1) * width];
                    float bottomLeft =  samples[max(0, x - 1) + min(height - 1, y + 1) * width];
                    float bottomRight = samples[min(width - 1, x + 1) + min(height - 1, y + 1) * width];

                    lowPassAtRB[x + y * width] = 0.25f * middle +
                            0.125f * (top + bottom + left + right) +
                            0.0625f * (topLeft + topRight + bottomLeft + bottomRight);
                }
            }
        }
    }

   public static float min(float a, float b) {
        return a < b ? a : b;
    }

    public static float max(float a, float b) {
        return a > b ? a : b;
    }

    public static float abs(float n) {
        return n < 0 ? -n : n;
    }

    public static int min(int a, int b) {
        return a < b ? a : b;
    }

    public static int max(int a, int b) {
        return a > b ? a : b;
    }

    public static int abs(int n) {
        return n < 0 ? -n : n;
    }

Running it as a task graph on an Apple Mac Mini (only available device is M1 chip) with --fullDebug and --printKernel results in the following error:

__kernel void calcLowpassAtRB(__global long *_kernel_context, __constant uchar *_constant_region, __local uchar *_local_region, __global int *_atomics, __global uchar *samples, __global uchar *lowPassAtRB, __private int width, __private int height, __private int green1Idx, __private int green2Idx)
{
  ulong ul_73, ul_39, ul_55, ul_21, ul_67, ul_1, ul_15, ul_47, ul_79, ul_0, ul_61, ul_30; 
  float f_74, f_48, f_80, f_81, f_68, f_40, f_62, f_31, f_82, f_83, f_84, f_85, f_22, f_86, f_87, f_56, f_88, f_89; 
  int i_50, i_51, i_57, i_42, i_43, i_49, i_4, i_5, i_69, i_2, i_3, i_8, i_9, i_6, i_7, i_63, i_18, i_24, i_25, i_23, i_10, i_11, i_75, i_16, i_17, i_34, i_35, i_41, i_26, i_90, i_91, i_32, i_33; 
  long l_36, l_37, l_38, l_27, l_28, l_29, l_52, l_53, l_54, l_44, l_45, l_46, l_64, l_65, l_66, l_70, l_71, l_58, l_59, l_60, l_72, l_12, l_76, l_13, l_77, l_14, l_78; 
  bool z_19, z_20; 

  // BLOCK 0
  ul_0  =  (ulong) samples;
  ul_1  =  (ulong) lowPassAtRB;
  i_2  =  get_global_size(0);
  i_3  =  get_global_size(1);
  i_4  =  get_global_id(0);
  i_5  =  get_global_id(1);
  // BLOCK 1 MERGES [0 11 ]
  i_6  =  i_5;
  for(;i_6 < 2144;)
  {
    // BLOCK 2
    i_7  =  i_6 % 2;
    // BLOCK 3 MERGES [2 10 ]
    i_8  =  i_4;
    for(;i_8 < 1560;)
    {
      // BLOCK 4
      i_9  =  i_2 + i_8;
      i_10  =  i_8 * 2144;
      i_11  =  i_10 + i_6;
      l_12  =  (long) i_11;
      l_13  =  l_12 << 2;
      l_14  =  l_13 + 16L;
      ul_15  =  ul_1 + l_14;
      i_16  =  i_8 % 2;
      i_17  =  i_16 << 1;
      i_18  =  i_17 + i_7;
      z_19  =  i_18 == 1;
      if(z_19)
      {
        // BLOCK 5
      }  // B5
      else
      {
        // BLOCK 6
        z_20  =  i_18 == 2;
        if(z_20)
        {
          // BLOCK 7
        }  // B7
        else
        {
          // BLOCK 9
          ul_21  =  ul_0 + l_14;
          f_22  =  *((__global float *) ul_21);
          i_23  =  i_8 + -1;
          i_24  =  (i_23 < 0) ? 0 : i_23;
          i_25  =  i_24 * 2144;
          i_26  =  i_25 + i_6;
          l_27  =  (long) i_26;
          l_28  =  l_27 << 2;
          l_29  =  l_28 + 16L;
          ul_30  =  ul_0 + l_29;
          f_31  =  *((__global float *) ul_30);
          i_32  =  i_8 + 1;
          i_33  =  (i_32 < 1560) ? i_32 : 1559;
          i_34  =  i_33 * 2144;
          i_35  =  i_34 + i_6;
          l_36  =  (long) i_35;
          l_37  =  l_36 << 2;
          l_38  =  l_37 + 16L;
          ul_39  =  ul_0 + l_38;
          f_40  =  *((__global float *) ul_39);
          i_41  =  i_6 + -1;
          i_42  =  (i_41 < 0) ? 0 : i_41;
          i_43  =  i_42 + i_10;
          l_44  =  (long) i_43;
          l_45  =  l_44 << 2;
          l_46  =  l_45 + 16L;
          ul_47  =  ul_0 + l_46;
          f_48  =  *((__global float *) ul_47);
          i_49  =  i_6 + 1;
          i_50  =  (i_49 < 2144) ? i_49 : 2143;
          i_51  =  i_50 + i_10;
          l_52  =  (long) i_51;
          l_53  =  l_52 << 2;
          l_54  =  l_53 + 16L;
          ul_55  =  ul_0 + l_54;
          f_56  =  *((__global float *) ul_55);
          i_57  =  i_42 + i_25;
          l_58  =  (long) i_57;
          l_59  =  l_58 << 2;
          l_60  =  l_59 + 16L;
          ul_61  =  ul_0 + l_60;
          f_62  =  *((__global float *) ul_61);
          i_63  =  i_25 + i_50;
          l_64  =  (long) i_63;
          l_65  =  l_64 << 2;
          l_66  =  l_65 + 16L;
          ul_67  =  ul_0 + l_66;
          f_68  =  *((__global float *) ul_67);
          i_69  =  i_42 + i_34;
          l_70  =  (long) i_69;
          l_71  =  l_70 << 2;
          l_72  =  l_71 + 16L;
          ul_73  =  ul_0 + l_72;
          f_74  =  *((__global float *) ul_73);
          i_75  =  i_34 + i_50;
          l_76  =  (long) i_75;
          l_77  =  l_76 << 2;
          l_78  =  l_77 + 16L;
          ul_79  =  ul_0 + l_78;
          f_80  =  *((__global float *) ul_79);
          f_81  =  f_62 + f_68;
          f_82  =  f_81 + f_74;
          f_83  =  f_82 + f_80;
          f_84  =  f_31 + f_40;
          f_85  =  f_84 + f_48;
          f_86  =  f_85 + f_56;
          f_87  =  f_86 * 0.125F;
          f_88  =  fma(f_22, 0.25F, f_87);
          f_89  =  fma(f_83, 0.0625F, f_88);
          *((__global float *) ul_15)  =  f_89;
        }  // B9
      }  // B6
      // BLOCK 8 MERGES [5 7 ]
      *((__global float *) ul_15)  =  0.0F;
    }  // B8
    // BLOCK 10 MERGES [9 8 ]
    i_90  =  i_9;
    i_8  =  i_90;
  }  // B10
  // BLOCK 11
  i_91  =  i_3 + i_6;
  i_6  =  i_91;
}  // B11
// BLOCK 12
return;
}  //  kernel

UNSUPPORTED (log once): buildComputeProgram: cl2Metal failed
[JNI] uk.ac.manchester.tornado.drivers.opencl> notify error:
[JNI] uk.ac.manchester.tornado.drivers.opencl> [CL_DEVICE_NOT_AVAILABLE] : OpenCL Error : Error: Build Program driver returned (10014)
[JNI] uk.ac.manchester.tornado.drivers.opencl> notify error:
[JNI] uk.ac.manchester.tornado.drivers.opencl> OpenCL Warning : clBuildProgram failed: could not build program for 0x1027f00 (Apple M1) (err:-2)
[JNI] uk.ac.manchester.tornado.drivers.opencl> notify error:
[JNI] uk.ac.manchester.tornado.drivers.opencl> [CL_BUILD_ERROR] : OpenCL Build Error : Compiler build log:
Compilation failed: 

program_source:137:1: error: expected identifier or '('
return;
^
program_source:138:1: error: extraneous closing brace ('}')
}  //  kernel
^


[TornadoVM-OCL-JNI] ERROR : clBuildProgram -> Returned: -11
�[31mError during code compilation with the OpenCL driver�[0m
�[31mUnable to compile task task demosaicing.lowPassAtRB - calcLowpassAtRB

Running the same on a Windows 10 machine with OpenCL Backends also results in the error [TornadoVM-OCL-JNI] ERROR : clBuildProgram -> Returned: -11, however without giving any details like above.

How To Reproduce

We are currently still working out how to open source the surrounding program. If it helps, I can privately send you a copy of the code with instructions. But maybe I just overlooked the use of some not-yet-implemented method, or something like that.

Expected behavior

The task to compile and run.

Computing system setup (please complete the following information):

  • OS: Darwin
  • Output of tornado --devices:
Number of Tornado drivers: 1
Driver: OpenCL
  Total number of OpenCL devices  : 1
  Tornado device=0:0  (DEFAULT)
	OPENCL --  [Apple] -- Apple M1
		Global Memory Size: 5.3 GB
		Local Memory Size: 32.0 KB
		Workgroup Dimensions: 3
		Total Number of Block Threads: [256]
		Max WorkGroup Configuration: [256, 256, 256]
		Device OpenCL C version: OpenCL C 1.2

Additional context

Same errorcode -11 on Windows 10, but without line number and additional information.


@jjfumero
Copy link
Member

Hi @fkellner. Thank you for the report. This is a bug in TornadoVM. It generates an OpenCL kernel with an extra "}" bracket. It will help if you can send us a test case (minimal code) that breaks.

This error should not happen if you use the PTX or SPIR-V backend. The PTX backend is supported on Windows via MSys2. The SPIR-V backend is only supported on Linux.

One thing, you can use TornadoMath functions (e.g., TornadoMath.max, etc) to access these functions. The TornadoMath class is known by the JIT compiler and it generates the OpenCL/PTX and SPIR-V intrinsic for fast-math operations.

@jjfumero jjfumero added the bug Something isn't working label Apr 27, 2023
@fkellner
Copy link
Author

Thanks for the quick reply and the tip!

I cannot test with PTX or SPIR-V, because I have no access to a device which supports them.

However, I tried to create a minimal example. When trying to reproduce the error, I received a different error, which I also received when executing the code that previously generated this error (even though I changed nothing) (see below, or check the .txt file in the repo).

The minimal example is available here: https://github.com/fkellner/tornadovm-debug

tornado --fullDebug --printKernel -jar Test-0.1-SNAPSHOT-jar-with-dependencies.jar
WARNING: Using incubator modules: jdk.incubator.vector, jdk.incubator.foreign
Starting Test
Loading DRIVER: uk.ac.manchester.tornado.drivers.opencl.OCLTornadoDriverProvider@6d3af739
java.lang.NoSuchFieldError: READ_ONLY
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.graal.phases.TornadoDataflowA
nalysis.processUsages(TornadoDataflowAnalysis.java:248)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.graal.phases.TornadoDataflowA
nalysis.run(TornadoDataflowAnalysis.java:72)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.graal.phases.TornadoDataflowA
nalysis.run(TornadoDataflowAnalysis.java:60)
        at jdk.internal.vm.compiler/org.graalvm.compiler.phases.BasePhase.apply(BasePhase.java:446)
        at jdk.internal.vm.compiler/org.graalvm.compiler.phases.BasePhase.apply(BasePhase.java:334)
        at jdk.internal.vm.compiler/org.graalvm.compiler.phases.PhaseSuite.run(PhaseSuite.java:390)
        at jdk.internal.vm.compiler/org.graalvm.compiler.phases.BasePhase.apply(BasePhase.java:446)
        at jdk.internal.vm.compiler/org.graalvm.compiler.phases.BasePhase.apply(BasePhase.java:334)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.sketcher.TornadoSketcher.buil
dSketch(TornadoSketcher.java:226)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.sketcher.TornadoSketcher$Torn
adoSketcherCallable.call(TornadoSketcher.java:186)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.sketcher.TornadoSketcher$Torn
adoSketcherCallable.call(TornadoSketcher.java:176)
        at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
        at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
        at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
        at java.base/java.lang.Thread.run(Thread.java:833)
java.util.concurrent.ExecutionException: uk.ac.manchester.tornado.api.exceptions.TornadoBailoutRunti
meException: Unable to build sketch for method: calcLowpassAtRB(READ_ONLY)
        at java.base/java.util.concurrent.FutureTask.report(FutureTask.java:122)
        at java.base/java.util.concurrent.FutureTask.get(FutureTask.java:191)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.sketcher.TornadoSketcher.look
up(TornadoSketcher.java:154)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.addInn
er(TornadoTaskGraph.java:571)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.addInn
er(TornadoTaskGraph.java:1935)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.addTas
k(TornadoTaskGraph.java:2022)
        at tornado.api@0.15/uk.ac.manchester.tornado.api.TaskGraph.task(TaskGraph.java:271)
        at Test@0.1-SNAPSHOT/test.App.main(App.java:27)
Caused by: uk.ac.manchester.tornado.api.exceptions.TornadoBailoutRuntimeException: Unable to build s
ketch for method: calcLowpassAtRB(READ_ONLY)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.sketcher.TornadoSketcher.buil
dSketch(TornadoSketcher.java:254)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.sketcher.TornadoSketcher$Torn
adoSketcherCallable.call(TornadoSketcher.java:186)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.sketcher.TornadoSketcher$Torn
adoSketcherCallable.call(TornadoSketcher.java:176)
        at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
        at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
        at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
        at java.base/java.lang.Thread.run(Thread.java:833)
uk.ac.manchester.tornado.api.exceptions.TornadoBailoutRuntimeException: Unable to build sketch for m
ethod: calcLowpassAtRB(READ_ONLY)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.sketcher.TornadoSketcher.buil
dSketch(TornadoSketcher.java:254)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.sketcher.TornadoSketcher$Torn
adoSketcherCallable.call(TornadoSketcher.java:186)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.sketcher.TornadoSketcher$Torn
adoSketcherCallable.call(TornadoSketcher.java:176)
        at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
        at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1136)
        at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:635)
        at java.base/java.lang.Thread.run(Thread.java:833)
Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException: Index -1 out of bounds for leng
th 256
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.graph.TornadoGraph.getNode(To
rnadoGraph.java:50)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.graph.TornadoGraphBuilder.bui
ldGraph(TornadoGraphBuilder.java:231)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.compil
e(TornadoTaskGraph.java:626)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.compil
eToTornadoVMBytecode(TornadoTaskGraph.java:695)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.schedu
leInner(TornadoTaskGraph.java:791)
        at tornado.runtime@0.15.1-dev/uk.ac.manchester.tornado.runtime.tasks.TornadoTaskGraph.schedu
le(TornadoTaskGraph.java:1213)
        at tornado.api@0.15/uk.ac.manchester.tornado.api.TaskGraph.execute(TaskGraph.java:782)
        at tornado.api@0.15/uk.ac.manchester.tornado.api.ImmutableTaskGraph.execute(ImmutableTaskGra
ph.java:73)
        at java.base/java.util.ArrayList.forEach(ArrayList.java:1511)
        at tornado.api@0.15/uk.ac.manchester.tornado.api.TornadoExecutionPlan$TornadoExecutor.execut
e(TornadoExecutionPlan.java:297)
        at tornado.api@0.15/uk.ac.manchester.tornado.api.TornadoExecutionPlan.execute(TornadoExecuti
onPlan.java:126)
        at Test@0.1-SNAPSHOT/test.App.main(App.java:37)
cleanup: programs  ..........0,000003700 s
cleanup: queues    ..........0,000529200 s
cleanup: context   ..........0,000041600 s
cleanup: total     ..........0,000574500 s
cleanup: programs  ..........0,000000600 s
cleanup: queues    ..........0,000058200 s
cleanup: context   ..........0,012598100 s
cleanup: total     ..........0,012656900 s

@jjfumero
Copy link
Member

Thank you. We will take a look internally and let you know.

@jjfumero
Copy link
Member

I took a look at this. The issue is related to the basic block traversal and the generation of OpenCL C code from the unstructured Control Flow Graph in the JIT compiler. This is not a problem when for the PTX and SPIR-V backends because we can jump to the same basic block from different ones in the generated assembly code. However, we can't do this for OpenCL.

From my view, the TornadoVM JIT compiler should add a new compiler phase for block-reordering and cloning conditional expressions. I will discuss this internally and provide a fix. In the meantime, this works:

    public static void calcLowpassAtRB(float[] samples, float[] lowPassAtRB, int width, int height, int green1Idx, int green2Idx) {
        for (@Parallel int x = 0; x < width; x++) {
            for (@Parallel int y = 0; y < height; y++) {
                int patternIdx = (x % 2) + 2 * (y % 2);
                boolean a = patternIdx == green1Idx;
                boolean b = patternIdx == green2Idx;
               // each condition in a separated branch.
                if (a) {
                    lowPassAtRB[x + y * width] = 0.0f;
                } else if (b) {
                    lowPassAtRB[x + y * width] = 0.0f;
                } else {
                    // 3x3 low-pass filter

                    float middle = samples[x + y * width];

                    float top = samples[x + TornadoMath.max(0, y - 1) * width];
                    float bottom = samples[x + TornadoMath.min(height - 1, y + 1) * width];
                    float left = samples[TornadoMath.max(0, x - 1) + y * width];
                    float right = samples[TornadoMath.min(width - 1, x + 1) + y * width];

                    float topLeft = samples[TornadoMath.max(0, x - 1) + TornadoMath.max(0, y - 1) * width];
                    float topRight = samples[TornadoMath.min(width - 1, x + 1) + TornadoMath.max(0, y - 1) * width];
                    float bottomLeft = samples[TornadoMath.max(0, x - 1) + TornadoMath.min(height - 1, y + 1) * width];
                    float bottomRight = samples[TornadoMath.min(width - 1, x + 1) + TornadoMath.min(height - 1, y + 1) * width];

                    lowPassAtRB[x + y * width] = 0.25f * middle + //
                            0.125f * (top + bottom + left + right) + //
                            0.0625f * (topLeft + topRight + bottomLeft + bottomRight);
                }
            }
        }
    }

@fkellner
Copy link
Author

fkellner commented May 1, 2023

Can confirm that it works, thank you very much for looking into it!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working
Projects
None yet
Development

No branches or pull requests

2 participants