Check problem type and raise an exception when the backend is taking …

…too long to complete
kxytechnologies · Mar 27, 2022 · 861f606 · 861f606
1 parent a97d136
commit 861f606
Show file tree

Hide file tree

Showing 15 changed files with 122 additions and 39 deletions.
diff --git a/docker/kxy/Dockerfile b/docker/kxy/Dockerfile
@@ -21,7 +21,7 @@ RUN pip install botocore
 RUN pip install boto3
 
 # Install kxy
-RUN pip install kxy==1.4.2
+RUN pip install kxy==1.4.3
 
 # Copy examples into the Notebooks folder
 RUN git clone https://github.com/kxytechnologies/kxy-python.git /opt/kxy-python

diff --git a/kxy/__init__.py b/kxy/__init__.py
@@ -19,7 +19,7 @@
 You should have received a copy of the GNU Affero General Public License
 along with this program.  If not, see <https://www.gnu.org/licenses/>.
 """
-__version__ = "1.4.2"
+__version__ = "1.4.3"
 
 from kxy.api import *
 from kxy.pre_learning import *

diff --git a/kxy/examples/numerai_example.py b/kxy/examples/numerai_example.py
@@ -0,0 +1,81 @@
+"""
+Example: fit LightGBM on Numerai data with and without 'leanml' feature selection.
+"""
+import pandas as pd
+from sklearn.metrics import r2_score
+
+import kxy
+from kxy.learning import get_lightgbm_learner_sklearn_api
+
+########
+# Data #
+########
+## Uncomment to download Numerai data
+# from numerapi import NumerAPI
+# napi = NumerAPI()
+# current_round = napi.get_current_round(tournament=8)
+# napi.download_dataset("numerai_training_data_int8.parquet", "numerai_training_data_int8.parquet")
+
+target_column = 'target'
+problem_type = 'regression'
+
+df = pd.read_parquet('numerai_training_data_int8.parquet')
+# Keep only the feature columns and the target.
+feature_columns = [name for name in df.columns if name.startswith('feature_')]
+columns = feature_columns + [target_column]
+df = df[columns]
+
+
+####################
+# Train/Test Split #
+####################
+random_seed = 0
+# A random 80% of the rows is held out for testing; the remaining 20% is used for training.
+test_df = df.sample(frac=0.8, random_state=random_seed)
+train_df = df.drop(index=test_df.index)
+
+train_features, train_labels = train_df[feature_columns], train_df[[target_column]]
+test_features, test_labels = test_df[feature_columns], test_df[[target_column]]
+
+
+##########################
+# With Feature Selection #
+##########################
+# LightGBM model factory
+lightgbm_regressor_learner_cls = get_lightgbm_learner_sklearn_api(
+    'lightgbm.LGBMRegressor',
+    n_jobs=-1,
+    colsample_bytree=0.1,
+    learning_rate=0.01,
+    n_estimators=2000,
+    max_depth=5,
+)
+
+# Lean boosting fit
+data_identifier = 'numerai_training_data_int8_train_seed_%d.parquet.gzip' % random_seed
+results = train_df.kxy.fit(
+    target_column,
+    lightgbm_regressor_learner_cls,
+    problem_type=problem_type,
+    feature_selection_method='leanml',
+    data_identifier=data_identifier,
+    snr='high',
+)
+
+predictor = results['predictor']
+selected_features = predictor.selected_variables
+
+print('Selected Variables')
+print(selected_features)
+
+# Training performance
+train_predictions = predictor.predict(train_features)
+train_r2 = r2_score(train_labels, train_predictions)
+
+# Testing performance
+test_predictions = predictor.predict(test_features)
+test_r2 = r2_score(test_labels, test_predictions)
+
+print('Compressed LightGBM: Training R^2: %.4f, Testing R^2: %.4f' % (train_r2, test_r2))
+
+
+#################################
+# Fit Without Feature Selection #
+#################################
+results = train_df.kxy.fit(
+    target_column,
+    lightgbm_regressor_learner_cls,
+    problem_type=problem_type,
+    feature_selection_method=None,
+)
+naive_predictor = results['predictor']
+
+# Training performance
+naive_train_predictions = naive_predictor.predict(train_features)
+naive_train_r2 = r2_score(train_labels, naive_train_predictions)
+
+# Testing performance
+naive_test_predictions = naive_predictor.predict(test_features)
+naive_test_r2 = r2_score(test_labels, naive_test_predictions)
+
+print('Naive LightGBM: Training R^2: %.4f, Testing R^2: %.4f' % (naive_train_r2, naive_test_r2))
+
+
diff --git a/kxy/misc/__init__.py b/kxy/misc/__init__.py
@@ -19,4 +19,5 @@
 """
 from .boruta import *
 from .rfe import *
-from .predictors import *
+from .predictors import *
+from .exceptions import *
diff --git a/kxy/misc/exceptions.py b/kxy/misc/exceptions.py
@@ -0,0 +1,4 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+class LongerThanExpectedException(Exception):
+	"""Signals that the backend is taking longer than expected to complete a task."""
diff --git a/kxy/misc/predictors.py b/kxy/misc/predictors.py
@@ -133,7 +133,8 @@ def fit(self, obj, target_column, learner_func, n_features, max_duration=None, p
 		x_df = obj[x_columns]
 		y_df = obj[[target_column]]
 
-		feature_selector = RFE(learner_func, path=path + '-' + self.__class__.__name__)
+		derived_path = None if path is None else path + '-' + self.__class__.__name__
+		feature_selector = RFE(learner_func, path=derived_path)
 		m = feature_selector.fit(x_df, y_df, n_features, max_duration=max_duration)
 		self.models = [m]
 		self.selected_variables = feature_selector.selected_variables
@@ -187,7 +188,8 @@ def fit(self, obj, target_column, learner_func, n_evaluations=20, pval=0.95, max
 		x_df = obj[x_columns]
 		y_df = obj[[target_column]]
 
-		feature_selector = Boruta(learner_func, path=path + '-' + self.__class__.__name__)
+		derived_path = None if path is None else path + '-' + self.__class__.__name__
+		feature_selector = Boruta(learner_func, path=derived_path)
 		m = feature_selector.fit(x_df, y_df, n_evaluations=n_evaluations, pval=pval, max_duration=max_duration)
 		self.models = [m]
 		self.selected_variables = feature_selector.selected_variables
@@ -235,7 +237,8 @@ def fit(self, obj, target_column, learner_func, path=None):
 		x_df = obj[x_columns]
 		y_df = obj[[target_column]]
 
-		feature_selector = NaiveLearner(learner_func, path=path + '-' + self.__class__.__name__)
+		derived_path = None if path is None else path + '-' + self.__class__.__name__
+		feature_selector = NaiveLearner(learner_func, path=derived_path)
 		m = feature_selector.fit(x_df, y_df)
 		self.models = [m]
 		self.selected_variables = feature_selector.selected_variables

diff --git a/kxy/pandas_extension/base_accessor.py b/kxy/pandas_extension/base_accessor.py
@@ -21,6 +21,14 @@ def __init__(self, pandas_obj):
 		self._obj = pandas_obj
 
 
+	def check_problem_type(self, problem_type, target_column):
+		"""
+		Validate that the target column is compatible with the requested problem type.
+
+		Only the ``'regression'`` problem type is checked: the target column must be
+		castable to float. Any other problem type is accepted without inspection.
+
+		Parameters
+		----------
+		problem_type : str
+			The problem type requested by the caller.
+		target_column : str
+			The name of the column of the underlying pandas object to validate.
+
+		Raises
+		------
+		ValueError
+			If ``problem_type`` is ``'regression'`` and the target column cannot be cast to float.
+		"""
+		if problem_type != 'regression':
+			return
+
+		try:
+			# Only the success of the cast matters; the converted values are discarded.
+			self._obj[target_column].astype(float)
+		except (TypeError, ValueError) as e:
+			# Catch only conversion failures so that interrupts and unrelated errors
+			# are not masked, and keep the original error as the cause.
+			raise ValueError('You specified regression as problem_type but the target column is not numeric') from e
+
+
 	def is_discrete(self, column):
 		"""
 		Determine whether the input column contains discrete (i.e as opposed to continuous) observations.

diff --git a/kxy/pandas_extension/post_learning_accessor.py b/kxy/pandas_extension/post_learning_accessor.py
@@ -60,6 +60,7 @@ def data_driven_improvability(self, target_column, new_variables, problem_type=N
 		assert target_column in self._obj.columns, 'The target_column should be a column'
 		if problem_type is None:
 			problem_type = 'classification' if self.is_discrete(target_column) else 'regression'
+		self.check_problem_type(problem_type, target_column)
 
 		_obj = self.anonymize(columns_to_exclude=[target_column]) if anonymize or (anonymize is None and self.is_too_large) else self._obj
 
@@ -112,6 +113,7 @@ def model_driven_improvability(self, target_column, prediction_column, problem_t
 		assert prediction_column in self._obj.columns, 'The prediction_column should be a column'
 		if problem_type is None:
 			problem_type = 'classification' if self.is_discrete(target_column) else 'regression'
+		self.check_problem_type(problem_type, target_column)
 
 		_obj = self.anonymize(columns_to_exclude=[target_column, prediction_column]) if anonymize or (anonymize is None and self.is_too_large) else self._obj
 
@@ -163,6 +165,7 @@ def model_explanation(self, prediction_column, problem_type=None, anonymize=None
 		assert prediction_column in self._obj.columns, 'The prediction_column should be a column'
 		if problem_type is None:
 			problem_type = 'classification' if self.is_discrete(prediction_column) else 'regression'
+		# Validate the prediction column: it is the column the checks above operate on.
+		self.check_problem_type(problem_type, prediction_column)
 
 		_obj = self.anonymize(columns_to_exclude=[prediction_column]) if anonymize or (anonymize is None and self.is_too_large) else self._obj
 

diff --git a/kxy/pandas_extension/pre_learning_accessor.py b/kxy/pandas_extension/pre_learning_accessor.py
@@ -60,6 +60,7 @@ def data_valuation(self, target_column, problem_type=None, anonymize=None, snr='
 		assert target_column in self._obj.columns, 'The target_column should be a column'
 		if problem_type is None:
 			problem_type = 'classification' if self.is_discrete(target_column) else 'regression'
+		self.check_problem_type(problem_type, target_column)
 
 		_obj = self.anonymize(columns_to_exclude=[target_column]) if anonymize or (anonymize is None and self.is_too_large) else self._obj
 
@@ -106,7 +107,8 @@ def variable_selection(self, target_column, problem_type=None, anonymize=None, s
 		assert target_column in self._obj.columns, 'The target_column should be a column'
 		if problem_type is None:
 			problem_type = 'classification' if self.is_discrete(target_column) else 'regression'
-
+		self.check_problem_type(problem_type, target_column)
+
 		_obj = self.anonymize(columns_to_exclude=[target_column]) if anonymize or (anonymize is None and self.is_too_large) else self._obj
 
 		return vs(_obj, target_column, problem_type, snr=snr, file_name=file_name)

diff --git a/kxy/post_learning/improvability.py b/kxy/post_learning/improvability.py
@@ -20,6 +20,7 @@
 	from halo import Halo
 
 from kxy.api import APIClient, upload_data
+from kxy.misc import LongerThanExpectedException
 
 # Cache old job ids to avoid being charged twice for the same job.
 DD_IMPROVABILITY_JOB_IDS = {}
@@ -310,6 +311,8 @@ def model_driven_improvability(data_df, target_column, prediction_column, proble
 			except:
 				logging.error('\nModel-driven improvability failed. Last HTTP code: %s' % api_response.status_code)
 
+	raise LongerThanExpectedException('The backend is taking longer than expected, but rest reassured your task is still running. Please try again later to retrieve your results.')
+
 	return None
 
 

diff --git a/kxy/post_learning/model_explanation.py b/kxy/post_learning/model_explanation.py
@@ -20,6 +20,7 @@
 	from halo import Halo
 
 from kxy.api import APIClient, upload_data
+from kxy.misc import LongerThanExpectedException
 
 # Cache old job ids to avoid being charged twice for the same job.
 EXPLANATION_JOB_IDS = {}
@@ -174,6 +175,8 @@ def model_explanation(data_df, prediction_column, problem_type, snr='auto', file
 			except:
 				logging.error('\nModel explanation failed. Last HTTP code: %s, Content: %s' % (api_response.status_code, api_response.content))
 
+	raise LongerThanExpectedException('The backend is taking longer than expected, but rest reassured your task is still running. Please try again later to retrieve your results.')
+
 	return None
 
 
diff --git a/kxy/pre_learning/achievable_performance.py b/kxy/pre_learning/achievable_performance.py
@@ -20,6 +20,7 @@
 	from halo import Halo
 
 from kxy.api import APIClient, upload_data
+from kxy.misc import LongerThanExpectedException
 
 # Cache old job ids to avoid being charged twice for the same job.
 VALUATION_JOB_IDS = {}
@@ -165,6 +166,8 @@ def data_valuation(data_df, target_column, problem_type, snr='auto', include_mut
 			except:
 				logging.error('\nData valuation failed. Last HTTP code: %s' % api_response.status_code)
 
+	raise LongerThanExpectedException('The backend is taking longer than expected, but rest reassured your task is still running. Please try again later to retrieve your results.')
+
 	return None
 
 
diff --git a/kxy/pre_learning/variable_selection.py b/kxy/pre_learning/variable_selection.py
@@ -20,6 +20,7 @@
 	from halo import Halo
 
 from kxy.api import APIClient, upload_data
+from kxy.misc import LongerThanExpectedException
 
 # Cache old job ids to avoid being charged twice for the same job.
 VARIABLE_SELECTION_JOB_IDS = {}
@@ -170,4 +171,6 @@ def variable_selection(data_df, target_column, problem_type, snr='auto', file_na
 			except:
 				logging.error('\nVariable selection failed. Last HTTP code: %s, Content: %s' % (api_response.status_code, api_response.content))
 
+	raise LongerThanExpectedException('The backend is taking longer than expected, but rest reassured your task is still running. Please try again later to retrieve your results.')
+
 	return None
diff --git a/setup.py b/setup.py
@@ -12,7 +12,7 @@
 with open('README.md') as f:
 	long_description = f.read()
 
-version = "1.4.2"
+version = "1.4.3"
 setup(name="kxy",
 	version=version,
 	zip_safe=False,

diff --git a/tests/test_numerai.py b/tests/test_numerai.py