You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
From hand_coded. Not really consequential since the hand_coded only triggers for reduces below 2048, but illustrative of a class of problems where a low-level optimization is available but requires something to be passed through higher-level opts down the stack to it intact.
(venv) [david@thinktop tinygrad]$ DEBUG=4 python test/test_arange.py
opening device METAL from pid:88195
opening device HSA from pid:88195
0 ━┳ STORE MemBuffer(idx=0, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(256, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)))
1 ┗━┳ ADD
2 ┣━┳ SUM ((1,), dtypes.int)
3 ┃ ┗━━ CONST ConstBuffer(val=1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(257, 511), strides=(0, 0), offset=0, mask=((0, 257), (255, 511)), contiguous=False), View(shape=(256, 256), strides=(1, 512), offset=0, mask=None, contiguous=False))))
4 ┗━━ CONST ConstBuffer(val=-1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(256, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)))
((LazyOp(op=BufferOps.STORE, src=(LazyOp(op=BinaryOps.ADD, src=(LazyOp(op=ReduceOps.SUM, src=(LazyOp(op=BufferOps.CONST, src=(), arg=ConstBuffer(val=1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(257, 511), strides=(0, 0), offset=0, mask=((0, 257), (255, 511)), contiguous=False), View(shape=(256, 256), strides=(1, 512), offset=0, mask=None, contiguous=False))))),), arg=((1,), dtypes.int)), LazyOp(op=BufferOps.CONST, src=(), arg=ConstBuffer(val=-1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(256, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),))))), arg=None),), arg=MemBuffer(idx=0, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(256, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)))),), [Opt(op=OptOps.GROUPTOP, axis=0, amt=16)])
simplified 1 PHI and 1 WHERE in loop
reduced UOp count from 37 to 35
reduced UOp count from 35 to 33
reduced UOp count from 33 to 31
reduced UOp count from 31 to 28
#define INFINITY (__builtin_inff())
#define NAN (__builtin_nanf(""))
typedef long unsigned int size_t;
typedef float float2 __attribute__((ext_vector_type(2)));
static inline __attribute__((device)) float2 make_float2(float x, float y) { return {x, y}; }
typedef float float4 __attribute__((ext_vector_type(4)));
static inline __attribute__((device)) float4 make_float4(float x, float y, float z, float w) { return {x, y, z, w}; }
typedef float float8 __attribute__((ext_vector_type(8)));
static inline __attribute__((device)) float8 make_float8(float x, float y, float z, float w, float a, float b, float c, float d) { return {x, y, z, w, a, b, c, d}; }
typedef signed int int4 __attribute__((ext_vector_type(4)));
static inline __attribute__((device)) int4 make_int4(signed int x, signed int y, signed int z, signed int w) { return {x, y, z, w}; }
typedef signed int int2 __attribute__((ext_vector_type(2)));
static inline __attribute__((device)) int2 make_int2(signed int x, signed int y) { return {x, y}; }
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
extern "C" {
__attribute__((device)) __attribute__((const)) float __ocml_fmax_f32(float, float);
__attribute__((device)) __attribute__((pure)) float __ocml_exp2_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_log2_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_sqrt_f32(float);
__attribute__((device)) float __ocml_sin_f32(float);
__attribute__((device)) __attribute__((const)) double __ocml_fmax_f64(double, double);
__attribute__((device)) __attribute__((pure)) double __ocml_exp2_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_log2_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_sqrt_f64(double);
__attribute__((device)) double __ocml_sin_f64(double);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
__attribute__((device)) _Float16 __ocml_sin_f16(_Float16);
}
extern "C" __attribute__((global))void __attribute__((amdgpu_flat_work_group_size(1, 16)))r_256_16_16(int* data0) {
__attribute__((shared))int temp[16];
int gidx0 = __ockl_get_group_id(0); /* 256 */
int lidx1 = __ockl_get_local_id(0); /* 16 */
int alu0 = __ocml_fmax_f32((gidx0+(lidx1*16)+(-239)),0);
int alu1 = __ocml_fmax_f32((-16),(-alu0));
temp[lidx1] = (-alu1);
__builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");__builtin_amdgcn_s_barrier();__builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
if ((lidx1<1)) {
int acc0 = 0;
for (int ridx0 = 0; ridx0 < 16; ridx0++) {
int val0 = temp[ridx0];
acc0 = (val0+acc0);
}
data0[gidx0] = (acc0+(-1));
}
}
*** HSA 1 r_256_16_16 arg 1 mem 0.00 GB tm 175.02us/ 0.18ms ( 0.59 GFLOPS, 0.01 GB/s)
0 ━┳ STORE MemBuffer(idx=0, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(2560, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)))
1 ┗━┳ ADD
2 ┣━┳ SUM ((1,), dtypes.int)
3 ┃ ┗━━ CONST ConstBuffer(val=1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(2561, 5119), strides=(0, 0), offset=0, mask=((0, 2561), (2559, 5119)), contiguous=False), View(shape=(2560, 2560), strides=(1, 5120), offset=0, mask=None, contiguous=False))))
4 ┗━━ CONST ConstBuffer(val=-1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(2560, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),)))
float4 merging axis : [(2, 2, 0, 4)]
((LazyOp(op=BufferOps.STORE, src=(LazyOp(op=BinaryOps.ADD, src=(LazyOp(op=ReduceOps.SUM, src=(LazyOp(op=BufferOps.CONST, src=(), arg=ConstBuffer(val=1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(2561, 5119), strides=(0, 0), offset=0, mask=((0, 2561), (2559, 5119)), contiguous=False), View(shape=(2560, 2560), strides=(1, 5120), offset=0, mask=None, contiguous=False))))),), arg=((1,), dtypes.int)), LazyOp(op=BufferOps.CONST, src=(), arg=ConstBuffer(val=-1, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(2560, 1), strides=(0, 0), offset=0, mask=None, contiguous=False),))))), arg=None),), arg=MemBuffer(idx=0, dtype=dtypes.int, st=ShapeTracker(views=(View(shape=(2560, 1), strides=(1, 0), offset=0, mask=None, contiguous=True),)))),), [Opt(op=OptOps.UPCAST, axis=0, amt=4), Opt(op=OptOps.UNROLL, axis=0, amt=4), Opt(op=OptOps.LOCAL, axis=0, amt=32)])
simplified 4 PHI and 7 WHERE in loop
reduced UOp count from 87 to 76
reduced UOp count from 76 to 68
reduced UOp count from 68 to 67
reduced UOp count from 67 to 66
reduced UOp count from 66 to 65
#define INFINITY (__builtin_inff())
#define NAN (__builtin_nanf(""))
typedef long unsigned int size_t;
typedef float float2 __attribute__((ext_vector_type(2)));
static inline __attribute__((device)) float2 make_float2(float x, float y) { return {x, y}; }
typedef float float4 __attribute__((ext_vector_type(4)));
static inline __attribute__((device)) float4 make_float4(float x, float y, float z, float w) { return {x, y, z, w}; }
typedef float float8 __attribute__((ext_vector_type(8)));
static inline __attribute__((device)) float8 make_float8(float x, float y, float z, float w, float a, float b, float c, float d) { return {x, y, z, w, a, b, c, d}; }
typedef signed int int4 __attribute__((ext_vector_type(4)));
static inline __attribute__((device)) int4 make_int4(signed int x, signed int y, signed int z, signed int w) { return {x, y, z, w}; }
typedef signed int int2 __attribute__((ext_vector_type(2)));
static inline __attribute__((device)) int2 make_int2(signed int x, signed int y) { return {x, y}; }
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_group_id(unsigned int);
extern "C" __attribute__((device)) __attribute__((const)) size_t __ockl_get_local_size(unsigned int);
extern "C" {
__attribute__((device)) __attribute__((const)) float __ocml_fmax_f32(float, float);
__attribute__((device)) __attribute__((pure)) float __ocml_exp2_f32(float);
__attribute__((device)) __attribute__((pure)) float __ocml_log2_f32(float);
__attribute__((device)) __attribute__((const)) float __ocml_sqrt_f32(float);
__attribute__((device)) float __ocml_sin_f32(float);
__attribute__((device)) __attribute__((const)) double __ocml_fmax_f64(double, double);
__attribute__((device)) __attribute__((pure)) double __ocml_exp2_f64(double);
__attribute__((device)) __attribute__((pure)) double __ocml_log2_f64(double);
__attribute__((device)) __attribute__((const)) double __ocml_sqrt_f64(double);
__attribute__((device)) double __ocml_sin_f64(double);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
__attribute__((device)) __attribute__((pure)) _Float16 __ocml_log2_f16(_Float16);
__attribute__((device)) __attribute__((const)) _Float16 __ocml_sqrt_f16(_Float16);
__attribute__((device)) _Float16 __ocml_sin_f16(_Float16);
}
extern "C" __attribute__((global))void __attribute__((amdgpu_flat_work_group_size(1, 32)))r_20_32_640_4_4(int* data0) {
int gidx0 = __ockl_get_group_id(0); /* 20 */
int lidx1 = __ockl_get_local_id(0); /* 32 */
int alu0 = ((gidx0*(-128))+(lidx1*(-4)));
int alu1 = ((gidx0*128)+(lidx1*4));
int alu2 = (((alu0+2558)/4)*(-1));
int alu3 = (((alu0+2557)/4)*(-1));
int alu4 = (alu3+639);
int alu5 = (((alu0+2556)/4)*(-1));
int alu6 = (alu5+639);
int alu7 = ((((alu0+2559)/4)*(-1))+640);
int alu8 = (alu2+640);
int alu9 = (alu3+640);
data0[alu1] = (alu7+alu6+alu4+alu2+639+(-1));
data0[alu1+1] = (alu8+alu7+alu6+alu4+(-1));
data0[alu1+2] = (alu9+alu8+alu7+alu6+(-1));
data0[alu1+3] = (alu5+640+alu9+alu8+alu7+(-1));
}
*** HSA 1 r_20_32_640_4_4 arg 1 mem 0.00 GB tm 19.29us/ 0.02ms ( 1.46 GFLOPS, 0.53 GB/s)
f1=102400, f2=28160
F
======================================================================
FAIL: test_complexity (__main__.TestArange.test_complexity)
----------------------------------------------------------------------
Traceback (most recent call last):
File "/home/david/code/tinygrad/test/test_arange.py", line 14, in test_complexity
assert 5 < f2 / f1 < 15, f"bad complexity, flops {f2/f1:.1f}X while inputs 10X"
^^^^^^^^^^^^^^^^
AssertionError: bad complexity, flops 0.3X while inputs 10X
----------------------------------------------------------------------
Ran 1 test in 0.186s
FAILED (failures=1)
avg: 1.46 GFLOPS 0.53 GB/s total: 1 kernels 0.00 GOPS 0.00 GB 0.02 ms
The text was updated successfully, but these errors were encountered:
Faced a similar issue with one of my previous O(n) embedding approaches and had a somewhat nice solution: args were added to the LazyOps, and if the embedding arg was present then the reduce heuristic (in this case the linearizer optimization) wouldn't run.
Tested on master after the UOp refactor; BEAM can get rid of the hcopt issue (say 16 vs. 160). I think the solution is to have a light enough search as the default.
From hand_coded. Not really consequential since the hand_coded only triggers for reduces below 2048, but illustrative of a class of problems where a low-level optimization is available but requires something to be passed through higher-level opts down the stack to it intact.
The text was updated successfully, but these errors were encountered: