Skip to content

Statistics

Jesús Carabaño edited this page Mar 1, 2017 · 6 revisions

This script computes a series of statistical values on the input raster. Some of the summarized statistics are the maximum, minumum, std and different types of means, e.g. the harmonic mean. These operations are of type Zonal, which means the reduce a values along some defined zone. In this case we are not giving explicit zones, which means the operations work in the whole raster. This script is simple but interesting because it presents opportunities for fusion, some fusion-preventing dependencies and scalar operations are better run by a scalar thread rather than in OpenCL.

	from map import * ## "Parallel Map Algebra" package

	dem = read('in_file_path')
	N = prod(dem.datasize())

	maxv = zmax(dem)
	minv = zmin(dem)
	rang = maxv - minv

	mean = zsum(dem) / N
	geom = zprod(dem) ** (1.0/N)
	harm = N / zsum(1.0/dem)
	quam = sqrt(zsum(dem**2)/N)
	cubm = cbrt(zsum(dem**3)/N)

	dev = (dem - mean) ** 2
	var = zsum(dev) / N
	vstd = sqrt(var)

	norm = (dev - minv) / rang
	nstd  = sqrt(zsum(norm) / N)

	print "max:  ", value(maxv), "min:  ", value(minv), "range: ", value(rang)
	print "mean: ", value(mean), "quam: ", value(quam), "cubm:  ", value(cubm)
	print "vstd: ", value(vstd), "nstd: ", value(nstd)
	print "geom: ", value(geom), "harm: ", value(harm)

The actual source file for the statistics script can be found here.

Graph of Dependencies

Below is the dependency graph showing six different groups of nodes (grouped by color). This organization differs from Hillshade, where all the nodes are seamlessly fused into one group. Moreover, only two groups presents two-dimensional operations (pink,red), while the other four groups work on scalar values. As a result those four scalar groups don't really need any OpenCL code, since they don't present data parallelism.

OpenCL Kernel Code

Here is the kernel code for the first non-scalar group. The reductions are performed completely in GPU memory, thus the use of atomic operations to finally reduce the values from shared memory to global memory.

kernel void krn2477221412142453674
(
	global float *IN_34, float IN_34v, uchar IN_34f,
	float IN_44v,
	int IN_48v,
	int IN_53v,
	global float *OUT_35, int idx_35,
	global float *OUT_36, int idx_36,
	global float *OUT_38, int idx_38,
	global float *OUT_41, int idx_41,
	global float *OUT_46, int idx_46,
	global float *OUT_50, int idx_50,
	global float *OUT_55, int idx_55,
	const int BS0,
	const int BS1,
	const int BC0,
	const int BC1,
	const int GS0,
	const int GS1,
	const int GN0,
	const int GN1
)
{
	float F32_34, F32_44, F32_35, F32_36, F32_38, F32_41, F32_45, F32_46, F32_49, F32_50, F32_54, F32_55;
	int S32_48, S32_53;
	local float F32S_35[256];
	local float F32S_36[256];
	local float F32S_38[256];
	local float F32S_41[256];
	local float F32S_46[256];
	local float F32S_50[256];
	local float F32S_55[256];
	
	int gc0 = get_local_id(0);
	int gc1 = get_local_id(1);
	int GC0 = get_group_id(0);
	int GC1 = get_group_id(1);
	int bc0 = get_global_id(0);
	int bc1 = get_global_id(1);
	
	// Previous to ZONAL core

	if (bc0 < BS0 && bc1 < BS1)
	{
		F32_34 = LOAD_L_F32(IN_34);
		F32_44 = IN_44v;
		S32_48 = IN_48v;
		S32_53 = IN_53v;
		F32_45 = F32_44 / F32_34;
		F32_49 = pow(F32_34,S32_48);
		F32_54 = pow(F32_34,S32_53);
		F32S_35[(gc1)*GS0+gc0] = F32_34;
		F32S_36[(gc1)*GS0+gc0] = F32_34;
		F32S_38[(gc1)*GS0+gc0] = F32_34;
		F32S_41[(gc1)*GS0+gc0] = F32_34;
		F32S_46[(gc1)*GS0+gc0] = F32_45;
		F32S_50[(gc1)*GS0+gc0] = F32_49;
		F32S_55[(gc1)*GS0+gc0] = F32_54;
	}
	else
	{
		F32S_35[(gc1)*GS0+gc0] = -INFINITY;
		F32S_36[(gc1)*GS0+gc0] = INFINITY;
		F32S_38[(gc1)*GS0+gc0] = 0;
		F32S_41[(gc1)*GS0+gc0] = 1;
		F32S_46[(gc1)*GS0+gc0] = 0;
		F32S_50[(gc1)*GS0+gc0] = 0;
		F32S_55[(gc1)*GS0+gc0] = 0;
	}
	
	// Zonal core

	for (int i=GS0*GS1/2; i>0; i/=2) {
		barrier(CLK_LOCAL_MEM_FENCE);
		if ((gc1)*GS0+gc0 < i)
		{
			F32S_35[(gc1)*GS0+gc0] = max(F32S_35[(gc1)*GS0+gc0], F32S_35[(gc1)*GS0+gc0 + i]);
			F32S_36[(gc1)*GS0+gc0] = min(F32S_36[(gc1)*GS0+gc0], F32S_36[(gc1)*GS0+gc0 + i]);
			F32S_38[(gc1)*GS0+gc0] = F32S_38[(gc1)*GS0+gc0] + F32S_38[(gc1)*GS0+gc0 + i];
			F32S_41[(gc1)*GS0+gc0] = F32S_41[(gc1)*GS0+gc0] * F32S_41[(gc1)*GS0+gc0 + i];
			F32S_46[(gc1)*GS0+gc0] = F32S_46[(gc1)*GS0+gc0] + F32S_46[(gc1)*GS0+gc0 + i];
			F32S_50[(gc1)*GS0+gc0] = F32S_50[(gc1)*GS0+gc0] + F32S_50[(gc1)*GS0+gc0 + i];
			F32S_55[(gc1)*GS0+gc0] = F32S_55[(gc1)*GS0+gc0] + F32S_55[(gc1)*GS0+gc0 + i];
		}
	}
	
	if (gc0 == 0 && gc1 == 0)
	{
		atomicMAX( (global char*)OUT_35+idx_35 , F32S_35[(gc1)*GS0+gc0]);
		atomicMIN( (global char*)OUT_36+idx_36 , F32S_36[(gc1)*GS0+gc0]);
		atomicSUM( (global char*)OUT_38+idx_38 , F32S_38[(gc1)*GS0+gc0]);
		atomicPROD( (global char*)OUT_41+idx_41 , F32S_41[(gc1)*GS0+gc0]);
		atomicSUM( (global char*)OUT_46+idx_46 , F32S_46[(gc1)*GS0+gc0]);
		atomicSUM( (global char*)OUT_50+idx_50 , F32S_50[(gc1)*GS0+gc0]);
		atomicSUM( (global char*)OUT_55+idx_55 , F32S_55[(gc1)*GS0+gc0]);
	}
	
	// Posterior to ZONAL core

	if (bc0 < BS0 && bc1 < BS1)
	{
	}
}

Assembly code for CPU devices

This is a dump of the assembly code generated by the Intel OpenCL compiler for a Skylake i7-6700 CPU. See how the reduction instructions are vectorized (i.e. vmaxps, vminps, vaddps, vmulps) and are fused together into the same basic block.

	...
	vpcmpgtd	128(%rcx), %ymm0, %ymm2
	vptest	%ymm2, %ymm2
	je	.LBB1_23
	movq	168(%rcx), %rax
	vmovups	(%rax), %ymm3
	movl	160(%rcx), %esi
	addl	%r14d, %esi
	movslq	%esi, %rsi
-->	vmaxps	512(%rbx,%rsi,4), %ymm3, %ymm3
	vptest	%ymm1, %ymm2
	jae	.LBB1_22
	vmovups	%ymm3, (%rax)
	movq	176(%rcx), %rax
	vmovups	(%rax), %ymm2
-->	vminps	1536(%rbx,%rsi,4), %ymm2, %ymm2
	vmovups	%ymm2, (%rax)
	movq	184(%rcx), %rax
	vmovups	(%rax), %ymm2
-->	vaddps	2560(%rbx,%rsi,4), %ymm2, %ymm2
	vmovups	%ymm2, (%rax)
	movq	192(%rcx), %rax
	vmovups	(%rax), %ymm2
-->	vmulps	3584(%rbx,%rsi,4), %ymm2, %ymm2
	vmovups	%ymm2, (%rax)
	movq	200(%rcx), %rax
	vmovups	(%rax), %ymm2
-->	vaddps	4608(%rbx,%rsi,4), %ymm2, %ymm2
	vmovups	%ymm2, (%rax)
	movq	208(%rcx), %rax
	vmovups	(%rax), %ymm2
-->	vaddps	5632(%rbx,%rsi,4), %ymm2, %ymm2
	vmovups	%ymm2, (%rax)
	movq	216(%rcx), %rax
	vmovups	(%rax), %ymm2
-->	vaddps	6656(%rbx,%rsi,4), %ymm2, %ymm2
	vmovups	%ymm2, (%rax)
	jmp	.LBB1_23
	...