EasonLiao · ralic · May 21, 2017
diff --git a/cudatree/__init__.py b/cudatree/__init__.py
@@ -1,6 +1,6 @@
-from random_forest import RandomForestClassifier, convert_result
-from datasource import load_data
-from util import timer
-from random_tree import RandomClassifierTree 
+from .random_forest import RandomForestClassifier, convert_result
+from .datasource import load_data
+from .util import timer
+from .random_tree import RandomClassifierTree 
 __version__ = "0.6"
 
diff --git a/cudatree/base_tree.py b/cudatree/base_tree.py
@@ -1,8 +1,8 @@
 import numpy as np
-from util import get_best_dtype
+from .util import get_best_dtype
 from pycuda import gpuarray
 import math
-from util import start_timer, end_timer
+from .util import start_timer, end_timer
 from pycuda import driver
 
 class BaseTree(object):
@@ -14,12 +14,12 @@ def print_tree(self):
     def recursive_print(idx, depth):
       if self.left_children[idx] == 0 and \
           self.right_children[idx] == 0:
-        print "[LEAF] Depth: %s, Value: %s" % \
-            (depth, self.values_array[idx])
+        print("[LEAF] Depth: %s, Value: %s" % \
+            (depth, self.values_array[idx]))
       else:
-        print "[NODE] Depth: %s, Feature: %s, Threshold: %f" %\
+        print("[NODE] Depth: %s, Feature: %s, Threshold: %f" %\
             (depth, self.feature_idx_array[idx], 
-            self.feature_threshold_array[idx])
+            self.feature_threshold_array[idx]))
         recursive_print(self.left_children[idx], depth + 1)
         recursive_print(self.right_children[idx], depth + 1) 
     recursive_print(0, 0)

diff --git a/cudatree/datasource.py b/cudatree/datasource.py
@@ -1,4 +1,4 @@
-import cPickle
+import pickle
 import numpy as np
 from os import path
 from sklearn.datasets import load_digits,load_iris,load_diabetes,fetch_covtype 
@@ -27,33 +27,33 @@ def load_data(ds_name):
     y_train = ds.target 
   elif ds_name == "cf10":
     with open(data_dir + "data_batch_1", "r") as f:
-      ds = cPickle.load(f)
+      ds = pickle.load(f)
       x_train = ds['data']
       y_train = np.array(ds['labels'])
   elif ds_name == "cf100":
     with open(data_dir + "train", "r") as f:
-      ds = cPickle.load(f)
+      ds = pickle.load(f)
       x_train = ds['data']
       y_train = np.array(ds['fine_labels'])
   elif ds_name == "cd10_test":
     with open(data_dir + "test_batch", "r") as f:
-      ds = cPickle.load(f)
+      ds = pickle.load(f)
       x_train = ds['data']
       y_train = np.array(ds['labels'])
   elif ds_name == "cf100_test":
     with open(data_dir + "test", "r") as f:
-      ds = cPickle.load(f)
+      ds = pickle.load(f)
       x_train = ds['data']
       y_train = np.array(ds['fine_labels'])
   elif ds_name == "inet":
     if _img_data is None:
       with open("/ssd/imagenet-subset.pickle", "r") as f:
-        _img_data = cPickle.load(f)
+        _img_data = pickle.load(f)
     return _img_data['x'][0:10000],  _img_data['Y'][0:10000] 
   elif ds_name == "inet_test":
     if _img_data is None:
       with open("/ssd/imagenet-subset.pickle", "r") as f:
-        _img_data = cPickle.load(f)
+        _img_data = pickle.load(f)
     return _img_data['x'][10000:],  _img_data['Y'][10000:] 
   elif ds_name == "kdd":
     data = np.load(data_dir + "data.npy")

diff --git a/cudatree/random_forest.py b/cudatree/random_forest.py
@@ -1,9 +1,9 @@
 import numpy as np
-from random_tree import RandomClassifierTree
-from util import timer, get_best_dtype, dtype_to_ctype, mk_kernel, mk_tex_kernel, compile_module
+from .random_tree import RandomClassifierTree
+from .util import timer, get_best_dtype, dtype_to_ctype, mk_kernel, mk_tex_kernel, compile_module
 from pycuda import gpuarray
 from pycuda import driver
-from util import start_timer, end_timer, show_timings
+from .util import start_timer, end_timer, show_timings
 from parakeet import jit
 import math
 
@@ -314,21 +314,21 @@ def fit(self, samples, target, bfs_threshold = None):
       self.bfs_threshold = bfs_threshold
 
     if self.verbose: 
-      print "bsf_threadshold : %d; bootstrap : %r; min_samples_split : %d" % (self.bfs_threshold, 
-          self.bootstrap,  self.min_samples_split)
-      print "n_samples : %d; n_features : %d; n_labels : %d; max_features : %d" % (self.stride, 
-          self.n_features, self.n_labels, self.max_features)
+      print("bsf_threadshold : %d; bootstrap : %r; min_samples_split : %d" % (self.bfs_threshold, 
+          self.bootstrap,  self.min_samples_split))
+      print("n_samples : %d; n_features : %d; n_labels : %d; max_features : %d" % (self.stride, 
+          self.n_features, self.n_labels, self.max_features))
 
 
-    self._trees = [RandomClassifierTree(self) for i in xrange(self.n_estimators)] 
+    self._trees = [RandomClassifierTree(self) for i in range(self.n_estimators)] 
 
     for i, tree in enumerate(self._trees):
       si, n_samples = self._get_sorted_indices(self.sorted_indices)
 
       if self.verbose: 
         with timer("Tree %s" % (i,)):
           tree.fit(self.samples, self.target, si, n_samples)   
-        print ""
+        print("")
       else:
         tree.fit(self.samples, self.target, si, n_samples)   
 
@@ -355,7 +355,7 @@ def predict(self, x):
     for i, tree in enumerate(self._trees):
       res[i] =  tree.gpu_predict(x, self.predict_kernel)
 
-    res =  np.array([np.argmax(np.bincount(res[:,i])) for i in xrange(res.shape[1])]) 
+    res =  np.array([np.argmax(np.bincount(res[:,i])) for i in range(res.shape[1])]) 
     if hasattr(self, "compt_table"):
       res = convert_result(self.compt_table, res) 
 
@@ -369,7 +369,7 @@ def predict_proba(self, x):
     for i, tree in enumerate(self._trees):
       res[i] =  tree.gpu_predict(x, self.predict_kernel)
 
-    for i in xrange(x.shape[0]):
+    for i in range(x.shape[0]):
       tmp_res = np.bincount(res[:, i])
       tmp_res.resize(self.n_labels)
       res_proba[i] = tmp_res.astype(np.float64) / len(self._trees)

diff --git a/cudatree/random_tree.py b/cudatree/random_tree.py
@@ -3,15 +3,15 @@
 from pycuda import gpuarray
 import numpy as np
 import math
-from util import total_times, compile_module, mk_kernel, mk_tex_kernel, timer
-from util import  dtype_to_ctype, get_best_dtype, start_timer, end_timer
-from base_tree import BaseTree
+from .util import total_times, compile_module, mk_kernel, mk_tex_kernel, timer
+from .util import  dtype_to_ctype, get_best_dtype, start_timer, end_timer
+from .base_tree import BaseTree
 from pycuda import driver
 import random
 from parakeet import jit
-from util import start_timer, end_timer, show_timings
+from .util import start_timer, end_timer, show_timings
 import sys
-import util
+from . import util
 
 def sync():
   if False:
@@ -37,7 +37,7 @@ def restore_tree(left_children,
 
 @jit
 def  _shuffle(x, r):
-  for i in xrange(1, len(x)):
+  for i in range(1, len(x)):
     j = np.fmod(r[i], i)
     old_xj = x[j]
     x[j] = x[i]

diff --git a/cudatree/util.py b/cudatree/util.py
@@ -53,11 +53,11 @@ def __init__(self, name):
     self.name = name
 
   def __enter__(self, *args):
-    print "Running %s" % self.name 
+    print("Running %s" % self.name) 
     self.start_t = time.time()
 
   def __exit__(self, *args):
-    print "Time for %s: %s" % (self.name, time.time() - self.start_t)
+    print("Time for %s: %s" % (self.name, time.time() - self.start_t))
 
 def dtype_to_ctype(dtype):
   if dtype.kind == 'f':
@@ -126,15 +126,15 @@ def end_timer(name):
   total_times[name] = total
 
 def show_timings(limit = 100):
-  tables = sorted(total_times.iteritems(), 
+  tables = sorted(total_times.items(), 
                   key = operator.itemgetter(1), 
                   reverse = True) 
   idx = 0
-  print "---------Timings---------"
+  print("---------Timings---------")
   for key, value in tables:
-    print key.ljust(15), ":", value
+    print(key.ljust(15), ":", value)
     idx += 1
     if idx == limit:
       break
 
-  print "-------------------------"
+  print("-------------------------")
diff --git a/estimate_threshold.py b/estimate_threshold.py
@@ -27,20 +27,20 @@
   rfs[f] = cudatree.RandomForestClassifier(n_estimators = 3, bootstrap = False, max_features = max_features)
 
 for n_classes in reversed(all_classes):
-  print "n_classes", n_classes
+  print("n_classes", n_classes)
   for n_examples in reversed(all_examples):
-    print "n_examples", n_examples
+    print("n_examples", n_examples)
     y = np.random.randint(low = 0, high = n_classes, size = n_examples)
     for n_features in reversed(all_features):
-      print "n_features", n_features
+      print("n_features", n_features)
       max_features = int(np.sqrt(n_features))
-      print "sqrt(n_features) =", max_features 
+      print("sqrt(n_features) =", max_features) 
       if n_features * n_examples > 10**7:
-        print "Skipping due excessive n_features * n_examples..."
-	i += len(thresholds)
+        print("Skipping due excessive n_features * n_examples...")
+        i += len(thresholds)
         continue
       if n_examples * n_classes > 10 ** 7:
-        print "Skipping due to excessive n_examples * n_classes"
-	i += len(thresholds)
+        print("Skipping due to excessive n_examples * n_classes")
+        i += len(thresholds)
         continue
 
@@ -51,38 +51,38 @@
       best_time = np.inf
       best_threshold = None
       best_threshold_prct = None 
-      print "(n_classes = %d, n_examples = %d, max_features = %d)" % (n_classes, n_examples, max_features)
+      print("(n_classes = %d, n_examples = %d, max_features = %d)" % (n_classes, n_examples, max_features))
       tested_thresholds = []
       times = []
       for bfs_threshold in thresholds:
         bfs_threshold_prct = float(bfs_threshold) / n_examples
-        print "  -- (%d / %d) threshold %d (%0.2f%%)" % (i, total_iters,  bfs_threshold, bfs_threshold_prct * 100)
+        print("  -- (%d / %d) threshold %d (%0.2f%%)" % (i, total_iters,  bfs_threshold, bfs_threshold_prct * 100))
         i += 1 
         if bfs_threshold > n_examples:
-          print "Skipping threshold > n_examples" 
-	  continue 
+          print("Skipping threshold > n_examples") 
+          continue
         if bfs_threshold / float(n_examples) < 0.001:
-	  print "SKipping, BFS threshold too small relative to n_examples"
+          print("SKipping, BFS threshold too small relative to n_examples")
 
 
         start_t = time.time()
         rf.fit(x, y, bfs_threshold)
         t = time.time() - start_t
         tested_thresholds.append(bfs_threshold)
         times.append(t)
-        print "  ---> total time", t 
+        print("  ---> total time", t) 
         if t < best_time:
           best_time = t
           best_threshold = bfs_threshold
           best_theshold_prct = bfs_threshold_prct
-      print "thresholds", tested_thresholds
-      print "times", times 
+      print("thresholds", tested_thresholds)
+      print("times", times) 
       inputs.append([1.0, n_classes, n_examples, max_features])
       best_threshold_values.append(best_threshold)
       best_threshold_prcts.append(best_threshold_prct)
 
 X = np.array(inputs)
-print "input shape", X.shape
+print("input shape", X.shape)
 
 
 
@@ -91,9 +91,9 @@
 Y = best_threshold_values
 
 lstsq_result = np.linalg.lstsq(X, Y)
-print "Regression coefficients:", lstsq_result[0]
+print("Regression coefficients:", lstsq_result[0])
 n = len(best_threshold_values)
-print "Regression residual:", lstsq_result[1], "RMSE:", np.sqrt(lstsq_result[1] / n)
+print("Regression residual:", lstsq_result[1], "RMSE:", np.sqrt(lstsq_result[1] / n))
 
 import socket 
 csv_filename = "threshold_results_" + socket.gethostname()
@@ -110,14 +110,14 @@
 
 log_lstsq_result = np.linalg.lstsq(LogX, LogY)
 
-print "Log regression coefficients:", log_lstsq_result[0]
+print("Log regression coefficients:", log_lstsq_result[0])
 n = len(best_threshold_values)
-print "Log regression residual:", log_lstsq_result[1], "RMSE:", np.sqrt(log_lstsq_result[1] / n)
+print("Log regression residual:", log_lstsq_result[1], "RMSE:", np.sqrt(log_lstsq_result[1] / n))
 log_pred = np.dot(LogX, log_lstsq_result[0])
 pred = np.exp(log_pred)
 residual = np.sum((Y - pred)**2)
-print "Actual residual", residual 
-print "Actual RMSE:", np.sqrt(residual / n)
+print("Actual residual", residual) 
+print("Actual RMSE:", np.sqrt(residual / n))
 
 
 """

diff --git a/hybridforest/__init__.py b/hybridforest/__init__.py
@@ -1 +1 @@
-from hybridforest import RandomForestClassifier
+from .hybridforest import RandomForestClassifier
diff --git a/hybridforest/hybridforest.py b/hybridforest/hybridforest.py
@@ -7,7 +7,7 @@
 from multiprocessing import Value, Lock, cpu_count
 import atexit
 import pycuda
-from builder import CPUBuilder, GPUBuilder
+from .builder import CPUBuilder, GPUBuilder
 
 #kill the child process if any
 def cleanup(proc):
@@ -149,7 +149,7 @@ def fit(self, X, Y, bfs_threshold = None):
                               self.max_features,
                               bfs_threshold,
                               remain_trees,
-                              lock) for i in xrange(self.n_gpus - 1)]
+                              lock) for i in range(self.n_gpus - 1)]
 
     pycuda.autoinit.context.pop()  
     for b in gpu_builders:
@@ -181,7 +181,7 @@ def predict(self, X):
     n_cd_trees = self.n_estimators - n_sk_trees
     cuda_proba = self._cuda_forest.predict_proba(X) * n_cd_trees
     final_proba = (sk_proba  + cuda_proba ) / self.n_estimators
-    res = np.array([np.argmax(final_proba[i]) for i in xrange(final_proba.shape[0])])
+    res = np.array([np.argmax(final_proba[i]) for i in range(final_proba.shape[0])])
 
     if hasattr(self._cuda_forest, "compt_table"):
       res = convert_result(self._cuda_forest.compt_table, res)

diff --git a/test/helpers.py b/test/helpers.py
@@ -12,14 +12,14 @@ def compare_accuracy(x,y, n_estimators = 11, bootstrap = True, slop = 0.98, n_re
   skrf = sklearn.ensemble.RandomForestClassifier(n_estimators = n_estimators, bootstrap = bootstrap)
   cuda_score_total = 0 
   sk_score_total = 0
-  for i in xrange(n_repeat):
+  for i in range(n_repeat):
     cudarf.fit(xtrain, ytrain)
     skrf.fit(xtrain, ytrain)
     sk_score = skrf.score(xtest, ytest)
     cuda_score = cudarf.score(xtest, ytest)
-    print "Iteration", i 
-    print "Sklearn score", sk_score 
-    print "CudaTree score", cuda_score 
+    print("Iteration", i) 
+    print("Sklearn score", sk_score) 
+    print("CudaTree score", cuda_score) 
     sk_score_total += sk_score 
     cuda_score_total += cuda_score 
 
@@ -39,14 +39,14 @@ def compare_hybrid_accuracy(x,y, n_estimators = 20, bootstrap = True, slop = 0.9
   skrf = sklearn.ensemble.RandomForestClassifier(n_estimators = n_estimators, bootstrap = bootstrap)
   cuda_score_total = 0 
   sk_score_total = 0
-  for i in xrange(n_repeat):
+  for i in range(n_repeat):
     hybridrf.fit(xtrain, ytrain)
     skrf.fit(xtrain, ytrain)
     sk_score = skrf.score(xtest, ytest)
     cuda_score = hybridrf.score(xtest, ytest)
-    print "Iteration", i 
-    print "Sklearn score", sk_score 
-    print "Hybrid score", cuda_score 
+    print("Iteration", i) 
+    print("Sklearn score", sk_score) 
+    print("Hybrid score", cuda_score) 
     sk_score_total += sk_score 
     cuda_score_total += cuda_score 
 

diff --git a/test/test_covtype.py b/test/test_covtype.py
@@ -12,10 +12,10 @@ def test_covtype_memorize():
     forest.fit(x, y, bfs_threshold = 500000)
   with timer("Predict"):
     diff, total = util.test_diff(forest.predict(x), y)  
-    print "%s(Wrong)/%s(Total). The error rate is %f." % (diff, total, diff/float(total))
+    print("%s(Wrong)/%s(Total). The error rate is %f." % (diff, total, diff/float(total)))
   assert diff == 0, "Didn't perfectly memorize, got %d wrong" % diff
 
-from helpers import compare_accuracy, compare_hybrid_accuracy
+from .helpers import compare_accuracy, compare_hybrid_accuracy
 def test_covtype_accuracy():
   compare_accuracy(x,y)
   compare_hybrid_accuracy(x, y)