//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-29618528
// Cuda compilation tools, release 11.2, V11.2.152
// Based on NVVM 7.0.1
//

// PTX ISA version this module is written against.
.version 7.2
// Architecture the module was compiled for.
.target sm_61
// Generic addresses are 64 bits wide.
.address_size 64

	// .globl	block_sum
// Shared-memory scratch buffer used by block_sum. It is declared as an
// unsized, 4-byte-aligned byte array with external linkage, so its size is
// not fixed in this module.
.extern .shared .align 4 .b8 smem[];

// block_sum: per-block sum reduction of an f32 array through shared memory.
//
// Parameters:
//   block_sum_param_0  u64  address of the f32 input array; element
//                           ctaid.x * ntid.x + tid.x is read by each thread
//   block_sum_param_1  u64  address of the f32 output array; thread 0 of a
//                           block stores the block's sum at index ctaid.x
//   block_sum_param_2  u32  number of valid input elements (n)
//
// Behaviour:
//   * A block whose first global index is >= n exits as a whole and writes
//     no output element.
//   * In a partially filled block, threads whose global index is >= n
//     contribute 0.0 instead of exiting. This way every shared slot that the
//     reduction reads is initialised, and every thread of the block reaches
//     each bar.sync / bar.warp.sync.
//
// Launch requirements visible from the code: the warp stage unconditionally
// reads smem[tid + 32] for tid < 32, so the shared buffer must hold at least
// 64 floats. The tree stages only cover ntid.x == 1024, >= 512, >= 256 and
// >= 128.
// NOTE(review): presumably launched with a power-of-two ntid.x in [64, 1024]
// and ntid.x * 4 bytes of dynamic shared memory - confirm against the host
// code.
.visible .entry block_sum(
	.param .u64 block_sum_param_0,
	.param .u64 block_sum_param_1,
	.param .u32 block_sum_param_2
)
{
	.reg .pred 	%p<21>;
	.reg .f32 	%f<33>;
	.reg .b32 	%r<14>;
	.reg .b64 	%rd<9>;


	ld.param.u64 	%rd1, [block_sum_param_0];
	ld.param.u64 	%rd2, [block_sum_param_1];
	ld.param.u32 	%r6, [block_sum_param_2];
	mov.u32 	%r1, %ntid.x;
	mov.u32 	%r2, %ctaid.x;
	mov.u32 	%r3, %tid.x;

	// Block-uniform exit: if the block's first global index is already out
	// of range, no thread has data and no output element is written.
	mul.lo.s32 	%r13, %r2, %r1;
	setp.ge.u32 	%p20, %r13, %r6;
	@%p20 bra 	LBB0_21;

	// %r4 = global element index of this thread.
	mad.lo.s32 	%r4, %r2, %r1, %r3;
	setp.ge.u32 	%p1, %r4, %r6;

	// %r5 = shared-memory address of this thread's slot, smem + tid * 4.
	shl.b32 	%r7, %r3, 2;
	mov.u32 	%r8, smem;
	add.s32 	%r5, %r8, %r7;

	// Load the thread's element, or 0.0 when the index is out of range. The
	// address is always computed, but the load is predicated, so no
	// out-of-range global access happens.
	cvta.to.global.u64 	%rd3, %rd1;
	mul.wide.u32 	%rd4, %r4, 4;
	add.s64 	%rd5, %rd3, %rd4;
	mov.f32 	%f1, 0f00000000;
	@!%p1 ld.global.f32 	%f1, [%rd5];
	st.shared.f32 	[%r5], %f1;
	bar.sync 	0;

	// ---- Tree stage, stride 512 (only when ntid.x == 1024) ----
	setp.ne.s32 	%p2, %r1, 1024;
	@%p2 bra 	LBB0_5;

	// Skip threads in the upper half and threads whose partner element is
	// out of range.
	setp.gt.u32 	%p3, %r3, 511;
	add.s32 	%r9, %r4, 512;
	setp.ge.u32 	%p4, %r9, %r6;
	or.pred  	%p5, %p3, %p4;
	@%p5 bra 	LBB0_4;

	ld.shared.f32 	%f2, [%r5];
	ld.shared.f32 	%f3, [%r5+2048];
	add.f32 	%f4, %f3, %f2;
	st.shared.f32 	[%r5], %f4;

LBB0_4:
	bar.sync 	0;

LBB0_5:
	// ---- Tree stage, stride 256 (ntid.x >= 512) ----
	setp.lt.u32 	%p6, %r1, 512;
	@%p6 bra 	LBB0_9;

	setp.gt.u32 	%p7, %r3, 255;
	add.s32 	%r10, %r4, 256;
	setp.ge.u32 	%p8, %r10, %r6;
	or.pred  	%p9, %p7, %p8;
	@%p9 bra 	LBB0_8;

	ld.shared.f32 	%f5, [%r5];
	ld.shared.f32 	%f6, [%r5+1024];
	add.f32 	%f7, %f6, %f5;
	st.shared.f32 	[%r5], %f7;

LBB0_8:
	bar.sync 	0;

LBB0_9:
	// ---- Tree stage, stride 128 (ntid.x >= 256) ----
	setp.lt.u32 	%p10, %r1, 256;
	@%p10 bra 	LBB0_13;

	setp.gt.u32 	%p11, %r3, 127;
	add.s32 	%r11, %r4, 128;
	setp.ge.u32 	%p12, %r11, %r6;
	or.pred  	%p13, %p11, %p12;
	@%p13 bra 	LBB0_12;

	ld.shared.f32 	%f8, [%r5];
	ld.shared.f32 	%f9, [%r5+512];
	add.f32 	%f10, %f9, %f8;
	st.shared.f32 	[%r5], %f10;

LBB0_12:
	bar.sync 	0;

LBB0_13:
	// ---- Tree stage, stride 64 (ntid.x >= 128) ----
	setp.lt.u32 	%p14, %r1, 128;
	@%p14 bra 	LBB0_17;

	setp.gt.u32 	%p15, %r3, 63;
	add.s32 	%r12, %r4, 64;
	setp.ge.u32 	%p16, %r12, %r6;
	or.pred  	%p17, %p15, %p16;
	@%p17 bra 	LBB0_16;

	ld.shared.f32 	%f11, [%r5];
	ld.shared.f32 	%f12, [%r5+256];
	add.f32 	%f13, %f12, %f11;
	st.shared.f32 	[%r5], %f13;

LBB0_16:
	bar.sync 	0;

LBB0_17:
	// ---- Warp stage: threads 0..31 fold strides 32, 16, 8, 4, 2, 1 ----
	setp.gt.u32 	%p18, %r3, 31;
	@%p18 bra 	LBB0_19;

	// Stride 32: the partner slots (32..63) are not written during the warp
	// stage, so the store can follow the add directly.
	ld.shared.f32 	%f14, [%r5];
	ld.shared.f32 	%f15, [%r5+128];
	add.f32 	%f16, %f15, %f14;
	st.shared.f32 	[%r5], %f16;
	bar.warp.sync 	-1;

	// Strides 16..1: the partner slot is rewritten by another lane in the
	// same step. The running sum is kept in a register, and a warp barrier
	// separates every lane's read of its partner from any lane's write, so
	// the result does not depend on lanes executing in lockstep.
	ld.shared.f32 	%f17, [%r5+64];
	add.f32 	%f19, %f17, %f16;
	bar.warp.sync 	-1;
	st.shared.f32 	[%r5], %f19;
	bar.warp.sync 	-1;

	ld.shared.f32 	%f20, [%r5+32];
	add.f32 	%f22, %f20, %f19;
	bar.warp.sync 	-1;
	st.shared.f32 	[%r5], %f22;
	bar.warp.sync 	-1;

	ld.shared.f32 	%f23, [%r5+16];
	add.f32 	%f25, %f23, %f22;
	bar.warp.sync 	-1;
	st.shared.f32 	[%r5], %f25;
	bar.warp.sync 	-1;

	ld.shared.f32 	%f26, [%r5+8];
	add.f32 	%f28, %f26, %f25;
	bar.warp.sync 	-1;
	st.shared.f32 	[%r5], %f28;
	bar.warp.sync 	-1;

	ld.shared.f32 	%f29, [%r5+4];
	add.f32 	%f31, %f29, %f28;
	bar.warp.sync 	-1;
	st.shared.f32 	[%r5], %f31;

LBB0_19:
	// Thread 0 publishes the block total. It reads back the value it stored
	// itself, so no further barrier is needed.
	setp.ne.s32 	%p19, %r3, 0;
	@%p19 bra 	LBB0_21;

	ld.shared.f32 	%f32, [smem];
	cvta.to.global.u64 	%rd6, %rd2;
	mul.wide.u32 	%rd7, %r2, 4;
	add.s64 	%rd8, %rd6, %rd7;
	st.global.f32 	[%rd8], %f32;

LBB0_21:
	ret;

}

