One of a few experiments I did with moving libnabo to CUDA #33

Open · wants to merge 4 commits into master
15 changes: 15 additions & 0 deletions CMakeLists.txt
@@ -48,6 +48,21 @@ if(MSVC AND (MSVC_VERSION LESS 1600))
add_definitions(-DBOOST_STDINT)
endif(MSVC AND (MSVC_VERSION LESS 1600))

#CUDA
set(USE_CUDA "false" CACHE BOOL "Set phasers to CUDA!")
if(USE_CUDA)
#Sorry, Linux support only right now :( I don't own a copy of Windows, so it's a bit hard for me to make sure that it'll work on Windows. -Louis
find_package(CUDA)

SET(CUDA_SEPARABLE_COMPILATION ON)
SET(CUDA_PROPAGATE_HOST_FLAGS OFF)

SET(CUDA_NVCC_FLAGS "-arch=compute_30;-code=sm_30;-Xcompiler=-fPIC;--shared;--use_fast_math;-G")

link_libraries(${CUDA_LIBRARIES})
message("CUDA has been enabled. Currently only KD_Node is supported in this build")
endif(USE_CUDA)

# openmp
find_package(OpenMP)
if (OPENMP_FOUND)
32 changes: 32 additions & 0 deletions README.md
@@ -48,6 +48,38 @@ libnabo provides the following compilation options, available through [CMake]:
You can specify them with a command-line tool, `ccmake`, or with a graphical tool, `cmake-gui`.
Please read the [CMake documentation] for more information.

To use CUDA, install the latest NVIDIA developer driver and at least CUDA 7.0, then enable the `USE_CUDA` CMake option:

* For Ubuntu or Debian, follow [these instructions](http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-linux/#axzz3X7AKITTy).

* For Windows, follow [these instructions](http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-microsoft-windows/#axzz3X7AKITTy). Please note that this code has, as of April 12th 2015, NOT been tested on Windows.

* For Mac OS X, follow [these instructions](http://docs.nvidia.com/cuda/cuda-getting-started-guide-for-mac-os-x/#axzz3X7AKITTy). Clang 6.0 is required.

Compute Model 3.2 or better is required. Compute Model 3.5 or better is recommended. To see which compute model you have, click [here](https://developer.nvidia.com/cuda-gpus).
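
If you prefer to check programmatically, here is a small sketch using the CUDA runtime API (it assumes a single GPU at device index 0):

```cpp
#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, 0);  // query the first GPU
    std::printf("Compute capability: %d.%d\n", prop.major, prop.minor);
    return 0;
}
```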

CUDA Best Practices
-------------------

Query points should be stored in a linearized array, grouped into subgroups whose size is a multiple of 16. No more than 1024 subgroups should be queried at any given time.

Within each subgroup, points should be ordered by their distance from the center of the group, from least to greatest. This ensures minimal thread divergence and greatly improves performance.

The smaller the maximum distance of any point in a subgroup from the subgroup's center, the smaller the overall divergence.
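
A minimal host-side sketch of this layout, assuming 3-D points and that the centroid sort is done on the CPU before the batch is copied to the GPU; `Point3` and `appendSubgroup` are illustrative names, not part of libnabo:

```cpp
#include <algorithm>
#include <vector>

struct Point3 { float x, y, z; };

// Append one subgroup of query points (its size should be a multiple of 16) to
// the linearized array, sorted by squared distance to the subgroup's centroid.
void appendSubgroup(std::vector<Point3>& linearized, std::vector<Point3> group)
{
    Point3 c = {0.0f, 0.0f, 0.0f};
    for (const Point3& p : group) { c.x += p.x; c.y += p.y; c.z += p.z; }
    c.x /= group.size(); c.y /= group.size(); c.z /= group.size();

    auto dist2 = [&c](const Point3& p) {
        const float dx = p.x - c.x, dy = p.y - c.y, dz = p.z - c.z;
        return dx * dx + dy * dy + dz * dz;
    };
    std::sort(group.begin(), group.end(),
              [&](const Point3& a, const Point3& b) { return dist2(a) < dist2(b); });

    linearized.insert(linearized.end(), group.begin(), group.end());
}
```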

If a path proves to be too divergent, a new kernel will be launched and the query point subgroup will be split. This is suboptimal compared to a tightly packed subgroup, so avoid it at all costs. This fallback is also only available on Compute Model 3.5.

Currently, the kernel must be recompiled in order to change the dimension count. This is primarily because point striding was removed to simplify the in-development version; it will be added back at a later date.
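
Concretely, the dimension count is a compile-time constant in `nabo/cuda/kd_node.cu`; change the define and rebuild:

```cpp
// nabo/cuda/kd_node.cu
#define dim_count 3
```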

To call CUDA from libnabo, create a C-to-C++ wrapper and invoke it from the search function in kdtree_opencl.cpp. I have not created a full C++ wrapper for the CUDA code yet, but I plan to eventually do so.
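
A rough sketch of that bridging pattern, assuming the launch code lives in a `.cu` file compiled by nvcc; the header name `nabo_cuda.h` and the function `nabo_cuda_knn` are illustrative only, not part of the existing sources:

```cpp
// nabo_cuda.h -- shared between the host compiler and nvcc
#ifndef NABO_CUDA_H
#define NABO_CUDA_H

struct indx;   // defined in nabo/cuda/kd_node.cu
struct point;
struct heap;

#ifdef __cplusplus
extern "C" {
#endif
// Implemented in a .cu file; launches the clustered k-NN kernel on query
// points that are already resident on the GPU. C linkage avoids name
// mangling across the nvcc / host-compiler boundary.
void nabo_cuda_knn(const struct indx* static_data, const struct point* d_queries,
                   struct heap* d_results, int query_amt);
#ifdef __cplusplus
}
#endif

#endif // NABO_CUDA_H
```

kdtree_opencl.cpp would include this header and call `nabo_cuda_knn` from its search function after copying the query batch to the device.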

For optimal performance, restrict the bucket size to 10. A method to set the bucket size dynamically is planned, as the optimal value depends heavily on the GPU used for the computations.
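
On the CPU side of libnabo the bucket size is passed as an additional parameter when the tree is created; a sketch, assuming the `Parameters("bucketSize", ...)` form from `nabo.h` (check the header of your version for the exact signature):

```cpp
#include <nabo/nabo.h>
#include <Eigen/Core>

int main()
{
    // 3-D point cloud, one point per column.
    Eigen::MatrixXf M = Eigen::MatrixXf::Random(3, 1000);

    // Restrict the bucket size to 10 when building the kd-tree.
    Nabo::NNSearchF* nns = Nabo::NNSearchF::createKDTreeLinearHeap(
        M, 3, 0, Nabo::Parameters("bucketSize", unsigned(10)));

    delete nns;
    return 0;
}
```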

Double precision is NOT available, and I have no intention of adding it; this code is single precision only. If you wish to use doubles, modify the code to your liking.

When dynamic parallelism is finalized, tree depth will be limited to 22, as this is the standard limit for Compute Model 3.5. If NVIDIA alleviates this limit for consumer GPUs in the near future, or a workaround is discovered, I will remove this limit.

While dynamic parallelism will be used for divergent paths, only 1024 paths will be allowed to launch new kernels at any given time. Therefore, if more than 1024 paths prove to be divergent, please optimize or recluster your query set. A path is considered divergent if any given query point is outside the KNN max_rad2 of the center of the cluster.
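
A minimal sketch of that divergence test, assuming the cluster centre has already been computed and that `max_rad2` is the same squared radius used for the KNN query; `countDivergent` is an illustrative helper, not part of the library:

```cpp
#include <vector>

struct Query { float x, y, z; };

// Returns how many points of a cluster would be treated as divergent, i.e. lie
// outside the squared radius max_rad2 around the cluster centre.
int countDivergent(const std::vector<Query>& cluster, const Query& centre, float max_rad2)
{
    int divergent = 0;
    for (const Query& p : cluster)
    {
        const float dx = p.x - centre.x, dy = p.y - centre.y, dz = p.z - centre.z;
        if (dx * dx + dy * dy + dz * dz > max_rad2)
            ++divergent;
    }
    return divergent;
}
```
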
Quick compilation and installation under Unix
---------------------------------------------

234 changes: 234 additions & 0 deletions nabo/cuda/kd_node.cu
@@ -0,0 +1,234 @@
//CUDA runtime for KD_nodes



/*Roadmap:

(We are here)
|
\ /
v
Core functionality -> Dynamic Parallelism Experimentation -> Linking to existing libnabo framework -> Optimization -> Finalization

/Optimization |= (Search key sorting, linearizing the KD tree, improving GPU caching for node heaps)/

EST: Probably a month? Unsure. I need to look through the rest of the SDK.*/
#define maximum_depth 22
#define dim_count 3
#define K_size 16
#ifndef FLOAT_MAX
#define FLOAT_MAX 33554430.0f
#endif
#define BLOCK_SIZE 32
#define max_rad 256
//If coordinates are within 5% of each other, relative to their cluster maximum, treat them as a single point. To be used later
#define max_error 0.05f

#define OFFSIDE 0
#define ONSIDE 1
#define POINT_STRIDE 3
struct point
{
float data[dim_count];
};
struct heap_entry
{
float value;
unsigned char index;
};
struct stack_entry{
size_t n;
uint state;
};

__device__ float heapHeadValue(heap_entry* h)
{
return h->value;
}

__device__ heap_entry* heapHeadReplace(heap_entry* h, const int index, const float value, const uint K)
{
uint i = 0;
for (; i < K - 1; ++i)
{
if (h[i + 1].value > value)
h[i] = h[i + 1];
else
break;
}
h[i].value = value;
h[i].index = index;
return h;
}

//Fill a caller-provided buffer with sentinel distances
__device__ heap_entry *heapInit(heap_entry *h, const uint K)
{
for (uint i = 0; i < K; ++i)
h[i].value = FLOAT_MAX;
return h;
}
struct kd_node
{
//Which dimension
unsigned int dim;
//At what value was this node split?
int cutVal;
//The index of the current node
int index;
};


#define inx_size 12
struct /*__align__(inx_size)*/ indx
{
//The points of the KD tree
point *pts;
//The linked nodes
const kd_node *nodes;
};

//Just a utility function for converting an int equal to zero to one, and vice versa. It's not well optimized, but it was quick to write :P An XOR or bitshift would be cheaper
__device__ int flip(int in)
{
return abs(in - 1);
}
__device__ unsigned int childLeft(const unsigned int pos) { return 2*pos + 1; }
__device__ unsigned int childRight(const unsigned int pos) { return 2*pos + 2; }
struct maxAB
{
float A,B;
int indx_a, indx_b;
};

//Clamp the value to 1 or 0: returns 1 if |a| > 0, otherwise 0
__device__ static float intensity(float a)
{
return (fabs(a) > 0.0f) ? 1.0f : 0.0f;
}
struct heap
{
heap_entry *entries;
int current_count;
};
//If dynamic parallelism is not available, default to compute model 3_2, e.g. the early 700 series
#ifndef CM3_5
#define CM3_2
#endif
//Used to see if we're within bounds and are ready to jump a node
__device__ unsigned int withinBounds(int cd, point q, point p, float heapHeadVal, float maxRad, float maxError)
{
float diff = q.data[cd] - p.data[cd];
float side2 = diff*diff;
if ((side2 <= maxRad) &&
(side2 * maxError < heapHeadVal))
{
return 1;
}
return 0;
}
//Used for warp devices. One if distance is greater than zero. Returns 0 or 1
__device__ unsigned int nodeMinor(int cd, point q, point p)
{
float diff = q.data[cd] - p.data[cd];
return (unsigned int)intensity(diff);
}
//Implementation details: http://on-demand.gputechconf.com/gtc/2012/presentations/S0079-Warped-Parallel-Nearest-Neighbor-Searches-Using-KD-Trees.pdf
__device__ void recursive_warp_search(const indx static_data, const point query_point, unsigned int _Mask, heap *output,
uint stackpointer, stack_entry *stack, stack_entry *s)
{
stackpointer--;
const size_t n = s->n;
const kd_node node = static_data.nodes[n];
const int cd = node.cutVal;
//Continue doesn't do anything anymore since we're in a __device__ function (Not __global__), and there is no while loop
/*if (cd == -2)
continue;*/
const int index = node.index;
point p = static_data.pts[index];
// compute new distance and update if lower
float dist = 0;
for (uint i = 0; i < dim_count; ++i)
{
const float diff = query_point.data[i] - p.data[i];
dist += diff * diff;
}
if ((dist <= max_rad) &&
(dist < heapHeadValue(output->entries)) &&
(dist > (float)max_error))
{
output->entries = heapHeadReplace(output->entries, index, dist, K_size);
output->current_count++;
}
// look for recursion
//Let the warp group decide which way we want to travel next
_Mask = _Mask & __ballot(nodeMinor(cd, query_point,p));


//If side >= 0, then we branch right first
if(_Mask)
{
s->n = childRight(n);
recursive_warp_search(static_data, query_point, _Mask, output,
stackpointer, stack, s);
stackpointer++;
s = &stack[stackpointer];
//This needs to be called before the __any, since the thread needs to be terminated before we conduct the next vote.
if(output->current_count > K_size)
{
/*Exit the kernel if we have all of the points that we need. Since all of the points are clustered, hopefully this is greatly reduced and all threads exit
at near the same time*/
return;
}
//If any of the remaining active threads are within bounds of the left node
if(__any(withinBounds(cd,query_point, p, heapHeadValue(output->entries), max_rad, max_error)))
{
s->n = childLeft(n);
recursive_warp_search(static_data, query_point, _Mask, output,
stackpointer, stack, s);
stackpointer++;
}
}
//Otherwise we branch left
else
{
s->n = childLeft(n);
recursive_warp_search(static_data, query_point, _Mask, output,
stackpointer, stack, s);
stackpointer++;
s = &stack[stackpointer];
if(output->current_count > K_size)
{
/*Exit the kernel if we have all of the points that we need. Since all of the points are clustered, hopefully this is greatly reduced and all threads exit
at near the same time*/
return;
}
//If any of the remaining active threads are within bounds of the right node
if(__any(withinBounds(cd,query_point, p, heapHeadValue(output->entries), max_rad, max_error)))
{
s->n = childRight(n);
recursive_warp_search(static_data, query_point, _Mask, output,
stackpointer, stack, s);
stackpointer++;
}
}
//TODO: Sort

}
/*Kernel is to be executed as 32x1.
indx is pre-allocated and copied to the GPU to avoid memory bottlenecks. Query points are copied per iteration.
Uses a warped ballot system. Preferable for clustered query points that lie close together.
Make sure the thread group size is equal to the size of the cluster & is a multiple of 32*/
__global__ void clustered_search(indx static_data, const point *query_points, int *indices, heap *ret, int query_amt)
{
stack_entry stack[maximum_depth];
//Global thread number
int thread_num = blockIdx.x * BLOCK_SIZE + threadIdx.x;
heap myHeap;
//Per-thread result buffer; heapInit fills it with FLOAT_MAX sentinels
heap_entry entries[K_size];
myHeap.entries = heapInit(entries, K_size);
myHeap.current_count = 0;
//Start at the root node
stack_entry* s = stack;
s->n = 0;
uint startpos = 1;
recursive_warp_search(static_data, query_points[thread_num], 1, &myHeap, startpos, stack, s);
ret[thread_num] = myHeap;
}
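
// ---------------------------------------------------------------------------
// Hypothetical host-side usage sketch (not part of the original kernel above):
// the indx structure is allocated and copied to the GPU once, query points are
// copied per batch, and the kernel is launched with warp-sized (32x1) blocks,
// as described in the comment on clustered_search. All names are illustrative.
void example_launch(const point *h_pts, const kd_node *h_nodes,
                    size_t n_pts, size_t n_nodes,
                    const point *h_queries, int query_amt, heap *d_results)
{
    indx static_data;
    point *d_pts;
    kd_node *d_nodes;
    cudaMalloc((void**)&d_pts, n_pts * sizeof(point));
    cudaMalloc((void**)&d_nodes, n_nodes * sizeof(kd_node));
    cudaMemcpy(d_pts, h_pts, n_pts * sizeof(point), cudaMemcpyHostToDevice);
    cudaMemcpy(d_nodes, h_nodes, n_nodes * sizeof(kd_node), cudaMemcpyHostToDevice);
    static_data.pts = d_pts;
    static_data.nodes = d_nodes;

    point *d_queries;
    cudaMalloc((void**)&d_queries, query_amt * sizeof(point));
    cudaMemcpy(d_queries, h_queries, query_amt * sizeof(point), cudaMemcpyHostToDevice);

    // One warp (BLOCK_SIZE == 32 threads) per cluster of query points.
    clustered_search<<<(query_amt + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(
        static_data, d_queries, /*indices*/ (int*)0, d_results, query_amt);
    cudaDeviceSynchronize();

    cudaFree(d_queries);
}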

9 changes: 0 additions & 9 deletions nabo/opencl/knn_kdtree_pt_in_nodes.cl
@@ -64,15 +64,6 @@ kernel void knnKDTree( const global T* cloud,
continue;
const int index = node->index;
const global T* p = &cloud[index * POINT_STRIDE];
// check whether we already have better dist
if ((s->state == OFFSIDE) && (cd >= 0))
{
const T diff = q[cd] - p[cd];
// TODO: use approximate dist
// FIXME: bug in this early out
//if (diff * diff > heapHeadValue(heap))
// continue;
}
// compute new distance and update if lower
T dist = 0;
for (uint i = 0; i < DIM_COUNT; ++i)