You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
From hand_coded. Not really consequential since the hand_coded only triggers for reduces below 2048, but illustrative of a class of problems where a low-level optimization is available but requires something to be passed through higher-level opts down the stack to it intact.
(venv) [david@thinktop tinygrad]$ DEBUG=4 python test/test_arange.py
opening device METAL from pid:88195
opening device HSA from pid:88195
0 ━┳ STORE MemBuffer(idx=0, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(256, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)))
1 ┗━┳ ADD
2 ┣━┳ SUM ((1,), dtypes.int)
3 ┃ ┗━━ CONST ConstBuffer(val=1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(257, 511), strides=(0, 0), offset=0, mask=((0, 257), (255, 511)), contiguous=False), View(shape=(256, 256), strides=(1, 512), offset=0, mask=None, contiguous=False))))
4 ┗━━ CONST ConstBuffer(val=-1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(256, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)))
((LazyOp(op=BufferOps.STORE, src=(LazyOp(op=BinaryOps.ADD, src=(LazyOp(op=ReduceOps.SUM, src=(LazyOp(op=BufferOps.CONST, src=(), arg=ConstBuffer(val=1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(257, 511), strides=(0, 0), offset=0, mask=((0, 257), (255, 511)), contiguous=False), View(shape=(256, 256), strides=(1, 512), offset=0, mask=None, contiguous=False))))),), arg=((1,), dtypes.int)), LazyOp(op=BufferOps.CONST, src=(), arg=ConstBuffer(val=-1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(256, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),))))), arg=None),), arg=MemBuffer(idx=0, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(256, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)))),), [Opt(op=OptOps.GROUPTOP, axis=0, amt=16)])
simplified 1 PHI and 1 WHERE in loop
reduced UOp count from 37 to 35
reduced UOp count from 35 to 33
reduced UOp count from 33 to 31
reduced UOp count from 31 to 28
#define INFINITY (__builtin_inff())
#define NAN (__builtin_nanf(""))
typedef long unsigned int size_t;
typedef float float2 __attribute__((ext_vector_type(2)));
static inline __attribute__((device)) float2 make_float2(float x, float y) { return {x, y}; }
typedef float float4 __attribute__((ext_vector_type(4)));
static inline __attribute__((device)) float4 make_float4(float x, float y, float z, float w) { return {x, y, z, w}; }
typedef float float8 __attribute__((ext_vector_type(8)));
static inline __attribute__((device)) float8 make_float8(float x, float y, float z, float w, float a, float b, float c, float d) { return {x, y, z, w, a, b, c, d}; }
typedef signed int int4 __attribute__((ext_vector_type(4)));
static inline __attribute__((device)) int4 make_int4(signed int x, signed int y, signed int z, signed int w) { return {x, y, z, w}; }
typedef signed int int2 __attribute__((ext_vector_type(2)));
static inline __attribute__((device)) int2 make_int2(signed int x, signed int y) { return {x, y}; }
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
extern "C" {
__attribute__((device)) __attribute__((const)) float __ocml_fmax_f32(float, float);
__attribute__((device)) __attribute__((pure)) float __ocml_exp2_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_log2_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_sqrt_f32(float);
__attribute__((device)) float __ocml_sin_f32(float);
__attribute__((device)) __attribute__((const)) double __ocml_fmax_f64(double, double);
__attribute__((device)) __attribute__((pure)) double __ocml_exp2_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_log2_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_sqrt_f64(double);
__attribute__((device)) double __ocml_sin_f64(double);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
__attribute__((device)) _Float16 __ocml_sin_f16(_Float16);
}
extern "C" __attribute__((global))void __attribute__((amdgpu_flat_work_group_size(1, 16)))r_256_16_16(int* data0) {
__attribute__((shared))int temp[16];
int gidx0 = __ockl_get_group_id(0); /* 256 */
int lidx1 = __ockl_get_local_id(0); /* 16 */
int alu0 = __ocml_fmax_f32((gidx0+(lidx1*16)+(-239)),0);
int alu1 = __ocml_fmax_f32((-16),(-alu0));
temp[lidx1] = (-alu1);
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");__builtin_amdgcn_s_barrier();__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
if ((lidx1<1)) {
int acc0 = 0;
for (int ridx0 = 0; ridx0 < 16; ridx0++) {
int val0 = temp[ridx0];
acc0 = (val0+acc0);
}
data0[gidx0] = (acc0+(-1));
}
}
*** HSA 1 r_256_16_16 arg 1 mem 0.00 GB tm 175.02us/ 0.18ms ( 0.59 GFLOPS, 0.01 GB/s)
0 ━┳ STORE MemBuffer(idx=0, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(2560, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)))
1 ┗━┳ ADD
2 ┣━┳ SUM ((1,), dtypes.int)
3 ┃ ┗━━ CONST ConstBuffer(val=1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(2561, 5119), strides=(0, 0), offset=0, mask=((0, 2561), (2559, 5119)), contiguous=False), View(shape=(2560, 2560), strides=(1, 5120), offset=0, mask=None, contiguous=False))))
4 ┗━━ CONST ConstBuffer(val=-1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(2560, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)))
float4 merging axis : [(2, 2, 0, 4)]
((LazyOp(op=BufferOps.STORE, src=(LazyOp(op=BinaryOps.ADD, src=(LazyOp(op=ReduceOps.SUM, src=(LazyOp(op=BufferOps.CONST, src=(), arg=ConstBuffer(val=1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(2561, 5119), strides=(0, 0), offset=0, mask=((0, 2561), (2559, 5119)), contiguous=False), View(shape=(2560, 2560), strides=(1, 5120), offset=0, mask=None, contiguous=False))))),), arg=((1,), dtypes.int)), LazyOp(op=BufferOps.CONST, src=(), arg=ConstBuffer(val=-1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(2560, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),))))), arg=None),), arg=MemBuffer(idx=0, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(2560, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)))),), [Opt(op=OptOps.UPCAST, axis=0, amt=4), Opt(op=OptOps.UNROLL, axis=0, amt=4), Opt(op=OptOps.LOCAL, axis=0, amt=32)])
simplified 4 PHI and 7 WHERE in loop
reduced UOp count from 87 to 76
reduced UOp count from 76 to 68
reduced UOp count from 68 to 67
reduced UOp count from 67 to 66
reduced UOp count from 66 to 65
#define INFINITY (__builtin_inff())
#define NAN (__builtin_nanf(""))
typedef long unsigned int size_t;
typedef float float2 __attribute__((ext_vector_type(2)));
static inline __attribute__((device)) float2 make_float2(float x, float y) { return {x, y}; }
typedef float float4 __attribute__((ext_vector_type(4)));
static inline __attribute__((device)) float4 make_float4(float x, float y, float z, float w) { return {x, y, z, w}; }
typedef float float8 __attribute__((ext_vector_type(8)));
static inline __attribute__((device)) float8 make_float8(float x, float y, float z, float w, float a, float b, float c, float d) { return {x, y, z, w, a, b, c, d}; }
typedef signed int int4 __attribute__((ext_vector_type(4)));
static inline __attribute__((device)) int4 make_int4(signed int x, signed int y, signed int z, signed int w) { return {x, y, z, w}; }
typedef signed int int2 __attribute__((ext_vector_type(2)));
static inline __attribute__((device)) int2 make_int2(signed int x, signed int y) { return {x, y}; }
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
extern "C" {
__attribute__((device)) __attribute__((const)) float __ocml_fmax_f32(float, float);
__attribute__((device)) __attribute__((pure)) float __ocml_exp2_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_log2_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_sqrt_f32(float);
__attribute__((device)) float __ocml_sin_f32(float);
__attribute__((device)) __attribute__((const)) double __ocml_fmax_f64(double, double);
__attribute__((device)) __attribute__((pure)) double __ocml_exp2_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_log2_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_sqrt_f64(double);
__attribute__((device)) double __ocml_sin_f64(double);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
__attribute__((device)) _Float16 __ocml_sin_f16(_Float16);
}
extern "C" __attribute__((global))void __attribute__((amdgpu_flat_work_group_size(1, 32)))r_20_32_640_4_4(int* data0) {
int gidx0 = __ockl_get_group_id(0); /* 20 */
int lidx1 = __ockl_get_local_id(0); /* 32 */
int alu0 = ((gidx0*(-128))+(lidx1*(-4)));
int alu1 = ((gidx0*128)+(lidx1*4));
int alu2 = (((alu0+2558)/4)*(-1));
int alu3 = (((alu0+2557)/4)*(-1));
int alu4 = (alu3+639);
int alu5 = (((alu0+2556)/4)*(-1));
int alu6 = (alu5+639);
int alu7 = ((((alu0+2559)/4)*(-1))+640);
int alu8 = (alu2+640);
int alu9 = (alu3+640);
data0[alu1] = (alu7+alu6+alu4+alu2+639+(-1));
data0[alu1+1] = (alu8+alu7+alu6+alu4+(-1));
data0[alu1+2] = (alu9+alu8+alu7+alu6+(-1));
data0[alu1+3] = (alu5+640+alu9+alu8+alu7+(-1));
}
*** HSA 1 r_20_32_640_4_4 arg 1 mem 0.00 GB tm 19.29us/ 0.02ms ( 1.46 GFLOPS, 0.53 GB/s)
f1=102400, f2=28160
F
======================================================================
FAIL: test_complexity (__main__.TestArange.test_complexity)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/home/david/code/tinygrad/test/test_arange.py", line 14, in test_complexity
assert 5 < f2 / f1 < 15, f"bad complexity, flops {f2/f1:.1f}X while inputs 10X"
^^^^^^^^^^^^^^^^
AssertionError: bad complexity, flops 0.3X while inputs 10X
----------------------------------------------------------------------
Ran 1 test in 0.186s
FAILED (failures=1)
avg: 1.46 GFLOPS 0.53 GB/s total: 1 kernels 0.00 GOPS 0.00 GB 0.02 ms
The text was updated successfully, but these errors were encountered:
Faced a similar issue with one of my previous O(n) embedding approaches and had a somewhat nice solution: args were added to the LazyOps, and if the embedding arg was present then the reduce heuristic (in this case the linearizer optimization) wouldn't run.
Tested on master after the UOp refactor; BEAM can get rid of the hcopt issue (say 16 vs. 160). I think the solution is to have a light enough search as the default.
From hand_coded. Not really consequential since the hand_coded only triggers for reduces below 2048, but illustrative of a class of problems where a low-level optimization is available but requires something to be passed through higher-level opts down the stack to it intact.
The text was updated successfully, but these errors were encountered: