Skip to content

Commit

Permalink
Check problem type and raise an exception when the backend is taking …
Browse files Browse the repository at this point in the history
…too long to complete
  • Loading branch information
Yves-Laurent committed Mar 27, 2022
1 parent a97d136 commit 861f606
Show file tree
Hide file tree
Showing 15 changed files with 122 additions and 39 deletions.
2 changes: 1 addition & 1 deletion docker/kxy/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ RUN pip install botocore
RUN pip install boto3

# Install kxy
RUN pip install kxy==1.4.2
RUN pip install kxy==1.4.3

# Copy examples into the Notebooks folder
RUN git clone https://github.com/kxytechnologies/kxy-python.git /opt/kxy-python
Expand Down
2 changes: 1 addition & 1 deletion kxy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
__version__ = "1.4.2"
__version__ = "1.4.3"

from kxy.api import *
from kxy.pre_learning import *
Expand Down
81 changes: 81 additions & 0 deletions kxy/examples/numerai_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
from sklearn.metrics import r2_score
import kxy
import pandas as pd
from kxy.learning import get_lightgbm_learner_sklearn_api

# ---------------------------------------------------------------------------
# Data
# ---------------------------------------------------------------------------
# Uncomment the lines below to download the Numerai training data.
# from numerapi import NumerAPI
# napi = NumerAPI()
# current_round = napi.get_current_round(tournament=8)
# napi.download_dataset("numerai_training_data_int8.parquet", "numerai_training_data_int8.parquet")

target_column = 'target'
problem_type = 'regression'

df = pd.read_parquet('numerai_training_data_int8.parquet')
# Keep only the feature columns and the target.
feature_columns = [name for name in df.columns if name.startswith('feature_')]
columns = feature_columns + [target_column]
df = df[columns]


# ---------------------------------------------------------------------------
# Train/Test Split
# ---------------------------------------------------------------------------
random_seed = 0
# 80% of the rows are sampled into the test set; the remaining rows are used for training.
test_df = df.sample(frac=0.8, random_state=random_seed)
train_df = df.drop(test_df.index)
train_features, train_labels = train_df[feature_columns], train_df[[target_column]]
test_features, test_labels = test_df[feature_columns], test_df[[target_column]]


# ---------------------------------------------------------------------------
# With Feature Selection
# ---------------------------------------------------------------------------
# LightGBM model factory, shared by both fits below.
lightgbm_regressor_learner_cls = get_lightgbm_learner_sklearn_api(
    'lightgbm.LGBMRegressor',
    n_jobs=-1,
    colsample_bytree=0.1,
    learning_rate=0.01,
    n_estimators=2000,
    max_depth=5,
)

# Fit with the 'leanml' feature selection method.
results = train_df.kxy.fit(
    target_column,
    lightgbm_regressor_learner_cls,
    problem_type=problem_type,
    feature_selection_method='leanml',
    data_identifier='numerai_training_data_int8_train_seed_%d.parquet.gzip' % random_seed,
    snr='high',
)

predictor = results['predictor']
selected_features = predictor.selected_variables

print('Selected Variables')
print(selected_features)

# Predictions on the training and testing sets.
train_predictions = predictor.predict(train_features)
test_predictions = predictor.predict(test_features)

# R^2 on the training and testing sets.
train_r2 = r2_score(train_labels, train_predictions)
test_r2 = r2_score(test_labels, test_predictions)

print('Compressed LightGBM: Training R^2: %.4f, Testing R^2: %.4f' % (train_r2, test_r2))


# ---------------------------------------------------------------------------
# Fit Without Feature Selection
# ---------------------------------------------------------------------------
results = train_df.kxy.fit(
    target_column,
    lightgbm_regressor_learner_cls,
    problem_type=problem_type,
    feature_selection_method=None,
)
naive_predictor = results['predictor']

# Predictions on the training and testing sets.
naive_train_predictions = naive_predictor.predict(train_features)
naive_test_predictions = naive_predictor.predict(test_features)

# R^2 on the training and testing sets.
naive_train_r2 = r2_score(train_labels, naive_train_predictions)
naive_test_r2 = r2_score(test_labels, naive_test_predictions)

print('Naive LightGBM: Training R^2: %.4f, Testing R^2: %.4f' % (naive_train_r2, naive_test_r2))


3 changes: 2 additions & 1 deletion kxy/misc/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@
"""
from .boruta import *
from .rfe import *
from .predictors import *
from .predictors import *
from .exceptions import *
4 changes: 4 additions & 0 deletions kxy/misc/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
class LongerThanExpectedException(Exception):
    """
    Raised when the backend is taking longer than expected to complete a task.
    """
9 changes: 6 additions & 3 deletions kxy/misc/predictors.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,8 @@ def fit(self, obj, target_column, learner_func, n_features, max_duration=None, p
x_df = obj[x_columns]
y_df = obj[[target_column]]

feature_selector = RFE(learner_func, path=path + '-' + self.__class__.__name__)
derived_path = None if path is None else path + '-' + self.__class__.__name__
feature_selector = RFE(learner_func, path=derived_path)
m = feature_selector.fit(x_df, y_df, n_features, max_duration=max_duration)
self.models = [m]
self.selected_variables = feature_selector.selected_variables
Expand Down Expand Up @@ -187,7 +188,8 @@ def fit(self, obj, target_column, learner_func, n_evaluations=20, pval=0.95, max
x_df = obj[x_columns]
y_df = obj[[target_column]]

feature_selector = Boruta(learner_func, path=path + '-' + self.__class__.__name__)
derived_path = None if path is None else path + '-' + self.__class__.__name__
feature_selector = Boruta(learner_func, path=derived_path)
m = feature_selector.fit(x_df, y_df, n_evaluations=n_evaluations, pval=pval, max_duration=max_duration)
self.models = [m]
self.selected_variables = feature_selector.selected_variables
Expand Down Expand Up @@ -235,7 +237,8 @@ def fit(self, obj, target_column, learner_func, path=None):
x_df = obj[x_columns]
y_df = obj[[target_column]]

feature_selector = NaiveLearner(learner_func, path=path + '-' + self.__class__.__name__)
derived_path = None if path is None else path + '-' + self.__class__.__name__
feature_selector = NaiveLearner(learner_func, path=derived_path)
m = feature_selector.fit(x_df, y_df)
self.models = [m]
self.selected_variables = feature_selector.selected_variables
Expand Down
8 changes: 8 additions & 0 deletions kxy/pandas_extension/base_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@ def __init__(self, pandas_obj):
self._obj = pandas_obj


def check_problem_type(self, problem_type, target_column):
    """
    Check that the target column is compatible with the requested problem type.

    Only the ``'regression'`` case is validated: the target column must be castable
    to float. Any other value of ``problem_type`` is accepted without checks.

    Parameters
    ----------
    problem_type : str
        The type of supervised learning problem (e.g. ``'regression'``).
    target_column : str
        The name of the column of the underlying dataframe to validate.

    Raises
    ------
    ValueError
        If ``problem_type`` is ``'regression'`` and the target column cannot be
        cast to float. The original casting error is chained as the cause.
    """
    if problem_type != 'regression':
        return

    try:
        # The cast result is discarded: the cast is attempted only to see whether it succeeds.
        self._obj[target_column].astype(float)
    except (TypeError, ValueError) as err:
        # Only casting failures are translated; unrelated errors (e.g. interrupts) propagate.
        raise ValueError('You specified regression as problem_type but the target column is not numeric') from err


def is_discrete(self, column):
"""
Determine whether the input column contains discrete (i.e as opposed to continuous) observations.
Expand Down
3 changes: 3 additions & 0 deletions kxy/pandas_extension/post_learning_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def data_driven_improvability(self, target_column, new_variables, problem_type=N
assert target_column in self._obj.columns, 'The target_column should be a column'
if problem_type is None:
problem_type = 'classification' if self.is_discrete(target_column) else 'regression'
self.check_problem_type(problem_type, target_column)

_obj = self.anonymize(columns_to_exclude=[target_column]) if anonymize or (anonymize is None and self.is_too_large) else self._obj

Expand Down Expand Up @@ -112,6 +113,7 @@ def model_driven_improvability(self, target_column, prediction_column, problem_t
assert prediction_column in self._obj.columns, 'The prediction_column should be a column'
if problem_type is None:
problem_type = 'classification' if self.is_discrete(target_column) else 'regression'
self.check_problem_type(problem_type, target_column)

_obj = self.anonymize(columns_to_exclude=[target_column, prediction_column]) if anonymize or (anonymize is None and self.is_too_large) else self._obj

Expand Down Expand Up @@ -163,6 +165,7 @@ def model_explanation(self, prediction_column, problem_type=None, anonymize=None
assert prediction_column in self._obj.columns, 'The prediction_column should be a column'
if problem_type is None:
problem_type = 'classification' if self.is_discrete(prediction_column) else 'regression'
self.check_problem_type(problem_type, target_column)

_obj = self.anonymize(columns_to_exclude=[prediction_column]) if anonymize or (anonymize is None and self.is_too_large) else self._obj

Expand Down
4 changes: 3 additions & 1 deletion kxy/pandas_extension/pre_learning_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ def data_valuation(self, target_column, problem_type=None, anonymize=None, snr='
assert target_column in self._obj.columns, 'The target_column should be a column'
if problem_type is None:
problem_type = 'classification' if self.is_discrete(target_column) else 'regression'
self.check_problem_type(problem_type, target_column)

_obj = self.anonymize(columns_to_exclude=[target_column]) if anonymize or (anonymize is None and self.is_too_large) else self._obj

Expand Down Expand Up @@ -106,7 +107,8 @@ def variable_selection(self, target_column, problem_type=None, anonymize=None, s
assert target_column in self._obj.columns, 'The target_column should be a column'
if problem_type is None:
problem_type = 'classification' if self.is_discrete(target_column) else 'regression'

self.check_problem_type(problem_type, target_column)

_obj = self.anonymize(columns_to_exclude=[target_column]) if anonymize or (anonymize is None and self.is_too_large) else self._obj

return vs(_obj, target_column, problem_type, snr=snr, file_name=file_name)
Expand Down
3 changes: 3 additions & 0 deletions kxy/post_learning/improvability.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from halo import Halo

from kxy.api import APIClient, upload_data
from kxy.misc import LongerThanExpectedException

# Cache old job ids to avoid being charged twice for the same job.
DD_IMPROVABILITY_JOB_IDS = {}
Expand Down Expand Up @@ -310,6 +311,8 @@ def model_driven_improvability(data_df, target_column, prediction_column, proble
except:
logging.error('\nModel-driven improvability failed. Last HTTP code: %s' % api_response.status_code)

raise LongerThanExpectedException('The backend is taking longer than expected, but rest reassured your task is still running. Please try again later to retrieve your results.')

return None


Expand Down
3 changes: 3 additions & 0 deletions kxy/post_learning/model_explanation.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from halo import Halo

from kxy.api import APIClient, upload_data
from kxy.misc import LongerThanExpectedException

# Cache old job ids to avoid being charged twice for the same job.
EXPLANATION_JOB_IDS = {}
Expand Down Expand Up @@ -174,6 +175,8 @@ def model_explanation(data_df, prediction_column, problem_type, snr='auto', file
except:
logging.error('\nModel explanation failed. Last HTTP code: %s, Content: %s' % (api_response.status_code, api_response.content))

raise LongerThanExpectedException('The backend is taking longer than expected, but rest reassured your task is still running. Please try again later to retrieve your results.')

return None


3 changes: 3 additions & 0 deletions kxy/pre_learning/achievable_performance.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from halo import Halo

from kxy.api import APIClient, upload_data
from kxy.misc import LongerThanExpectedException

# Cache old job ids to avoid being charged twice for the same job.
VALUATION_JOB_IDS = {}
Expand Down Expand Up @@ -165,6 +166,8 @@ def data_valuation(data_df, target_column, problem_type, snr='auto', include_mut
except:
logging.error('\nData valuation failed. Last HTTP code: %s' % api_response.status_code)

raise LongerThanExpectedException('The backend is taking longer than expected, but rest reassured your task is still running. Please try again later to retrieve your results.')

return None


3 changes: 3 additions & 0 deletions kxy/pre_learning/variable_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from halo import Halo

from kxy.api import APIClient, upload_data
from kxy.misc import LongerThanExpectedException

# Cache old job ids to avoid being charged twice for the same job.
VARIABLE_SELECTION_JOB_IDS = {}
Expand Down Expand Up @@ -170,4 +171,6 @@ def variable_selection(data_df, target_column, problem_type, snr='auto', file_na
except:
logging.error('\nVariable selection failed. Last HTTP code: %s, Content: %s' % (api_response.status_code, api_response.content))

raise LongerThanExpectedException('The backend is taking longer than expected, but rest reassured your task is still running. Please try again later to retrieve your results.')

return None
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
with open('README.md') as f:
long_description = f.read()

version = "1.4.2"
version = "1.4.3"
setup(name="kxy",
version=version,
zip_safe=False,
Expand Down
31 changes: 0 additions & 31 deletions tests/test_numerai.py

This file was deleted.

0 comments on commit 861f606

Please sign in to comment.