diff --git a/.gitignore b/.gitignore index 25b8389..e4e2ed0 100644 --- a/.gitignore +++ b/.gitignore @@ -147,3 +147,5 @@ UCI*/ local_*.py *do-not-commit* AutogluonModels* +*-PFSPredictor +*-PCAPredictor diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..e69de29 diff --git a/CITATION.cff b/CITATION.cff index 5718ebc..cf53c4c 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -5,7 +5,7 @@ authors: given-names: Yves-Laurent orcid: "https://orcid.org/0000-0003-2901-6930" title: KXY: A Seemless API to 10x The Productivity of Machine Learning Engineers. -version: 1.2.33 +version: 1.4.3 date-released: "2021-10-12" abstract: KXY is a powerful serverless analysis toolkit that takes trial-and-error out of machine learning projects. url: "https://github.com/kxytechnologies/kxy-python" diff --git a/Makefile b/Makefile index 63924b8..7c31b71 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -VERSION = 1.4.2 +VERSION = 1.4.4 # Update the s3 bucket of the docs website deploy_docs: @@ -25,9 +25,11 @@ docker_release: docker_release_github: docker build -t ghcr.io/kxytechnologies/kxy-python:latest ./docker/kxy/ - echo $(CR_PAT) | docker login ghcr.io -u USERNAME --password-stdin && docker push ghcr.io/kxytechnologies/kxy-python:latest + # echo $(CR_PAT) | docker login ghcr.io -u USERNAME --password-stdin && docker push ghcr.io/kxytechnologies/kxy-python:latest + docker push ghcr.io/kxytechnologies/kxy-python:latest docker build -t ghcr.io/kxytechnologies/kxy-python:$(VERSION) ./docker/kxy/ - echo $(CR_PAT) | docker login ghcr.io -u USERNAME --password-stdin && docker push ghcr.io/kxytechnologies/kxy-python:$(VERSION) + # echo $(CR_PAT) | docker login ghcr.io -u USERNAME --password-stdin && docker push ghcr.io/kxytechnologies/kxy-python:$(VERSION) + docker push ghcr.io/kxytechnologies/kxy-python:$(VERSION) one_shot_release: @@ -45,6 +47,17 @@ update_docs: make refresh_web PATHS=/reference/* +github_release: + gh release create v$(VERSION) -F CHANGELOG.md + + +package_release: + make pypi_release + make github_release + make docker_release_github + make docker_release + + osr: make one_shot_release diff --git a/kxy/__init__.py b/kxy/__init__.py index 4af6a2a..6466f81 100644 --- a/kxy/__init__.py +++ b/kxy/__init__.py @@ -4,7 +4,7 @@ __author__ = "Dr. Yves-Laurent Kom Samo" __copyright__ = "Copyright (C) 2022 KXY Technologies, Inc." __license__ = """ -Copyright (C) 2021 KXY TECHNOLOGIES, INC. +Copyright (C) 2022 KXY TECHNOLOGIES, INC. This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -19,7 +19,7 @@ You should have received a copy of the GNU Affero General Public License along with this program. If not, see . """ -__version__ = "1.4.3" +__version__ = "1.4.4" from kxy.api import * from kxy.pre_learning import * diff --git a/kxy/examples/numerai_example.py b/kxy/examples/numerai_example.py index c415f95..e4407a0 100644 --- a/kxy/examples/numerai_example.py +++ b/kxy/examples/numerai_example.py @@ -22,7 +22,7 @@ #################### # Train/Test Split # #################### -random_seed = 0 +random_seed = 2 test_df = df.sample(frac=0.8, random_state=random_seed) train_df = df.drop(test_df.index) train_features = train_df[feature_columns] @@ -40,15 +40,16 @@ # Lean boosting fit results = train_df.kxy.fit(target_column, lightgbm_regressor_learner_cls, \ - problem_type=problem_type, feature_selection_method='leanml', \ - data_identifier='numerai_training_data_int8_train_seed_%d.parquet.gzip' % random_seed, \ - snr='high') + problem_type=problem_type, feature_selection_method='pfs', pfs_p=100, \ + data_identifier='numerai_training_data_int8_train_seed_%d.parquet.gzip' % random_seed) predictor = results['predictor'] -selected_features = predictor.selected_variables +p = predictor.feature_directions.shape[0] +print('Number of features: %d' % p) -print('Selected Variables') -print(selected_features) +# selected_features = predictor.selected_variables +# print('Selected Variables') +# print(selected_features) # Training/Testing Predictions train_predictions = predictor.predict(train_features) diff --git a/kxy/misc/mind.py b/kxy/misc/mind.py index cbfe24a..c9ac718 100644 --- a/kxy/misc/mind.py +++ b/kxy/misc/mind.py @@ -1,253 +1,13 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- - """ TensorFlow Implementation of MIND ([1]) under Spearman rank correlation constraints. [1] Kom Samo, Y. (2021). Inductive Mutual Information Estimation: A Convex Maximum-Entropy Copula Approach . Proceedings of The 24th International Conference on Artificial Intelligence and Statistics, in Proceedings of Machine Learning Research 130:2242-2250 Available from https://proceedings.mlr.press/v130/kom-samo21a.html. """ -import logging -logging.basicConfig(level=logging.INFO) - -from multiprocessing import Pool, cpu_count import numpy as np -import tensorflow as tf -tf.keras.backend.set_floatx('float64') -tf.config.threading.set_inter_op_parallelism_threads(2) -tf.config.threading.set_intra_op_parallelism_threads(8) -tf.config.set_soft_device_placement(True) - -from tensorflow.keras import Model -from tensorflow.keras.backend import pow as tf_pow -from tensorflow.keras.backend import cast, clip -from tensorflow.keras.callbacks import EarlyStopping, TerminateOnNaN -from tensorflow.keras.layers import Dense, Lambda, concatenate, Dot, Dropout, Layer -from tensorflow.keras.losses import Loss -from tensorflow.keras.optimizers import Adam -from tensorflow.keras.utils import Sequence -from tensorflow.python.ops import math_ops - -rankdata = lambda x: 1.+np.argsort(np.argsort(x, axis=0), axis=0) - - - -class CopulaBatchGenerator(Sequence): - ''' - Random batch generator. - ''' - def __init__(self, z, batch_size=1000, steps_per_epoch=100): - self.batch_size = batch_size - self.d = z.shape[1] - self.n = z.shape[0] - self.z = z - self.steps_per_epoch = steps_per_epoch - self.emp_u = rankdata(self.z)/(self.n + 1.) - self.emp_u[np.isnan(self.z)] = 0.5 - - if self.n < 200*self.d: - dn = 200*self.d - self.n - selected_rows = np.random.choice(self.n, dn, replace=True) - emp_u = self.emp_u[selected_rows, :].copy() - scale = 1./(100.*self.n) - emp_u += (scale*np.random.rand(*emp_u.shape) - 0.5*scale) - self.emp_u = np.concatenate([self.emp_u, emp_u], axis=0) - self.n = self.emp_u.shape[0] - - self.batch_selector = np.random.choice(self.n, self.batch_size*self.steps_per_epoch, replace=True) - self.batch_selector = self.batch_selector.reshape((self.steps_per_epoch, self.batch_size)) - - - def getitem_ndarray(self, idx): - ''' ''' - i = idx % self.steps_per_epoch - selected_rows = self.batch_selector[i] - emp_u_ = self.emp_u[selected_rows, :] - z_p = emp_u_.copy() - z_q = np.random.rand(*emp_u_.shape) - - z = np.empty((self.batch_size, self.d, 2)) - z[:, :, 0] = z_p - z[:, :, 1] = z_q - batch_x = z - batch_y = np.ones((self.batch_size, 2)) # Not used - return batch_x, batch_y - - - def __getitem__(self, idx): - ''' ''' - batch_x, batch_y = self.getitem_ndarray(idx) - return tf.convert_to_tensor(batch_x), tf.convert_to_tensor(batch_y) - - - def __len__(self): - return self.steps_per_epoch - - - -class InitializableDense(Layer): - ''' - ''' - def __init__(self, units, initial_w=None, initial_b=None, bias=False): - ''' - initial_w should be None or a 2D numpy array. - initial_b should be None or a 1D numpy array. - ''' - super(InitializableDense, self).__init__() - self.units = units - self.with_bias = bias - self.w_initializer = 'zeros' if initial_w is None else tf.constant_initializer(initial_w) - - if self.with_bias: - self.b_initializer = 'zeros' if initial_b is None else tf.constant_initializer(initial_b) - - - def build(self, input_shape): - ''' ''' - self.w = self.add_weight(shape=(input_shape[-1], self.units), \ - initializer=self.w_initializer, trainable=True, name='quad_w') - - if self.with_bias: - self.b = self.add_weight(shape=(self.units,), \ - initializer=self.b_initializer, trainable=True, name='quad_b') - - - def call(self, inputs): - ''' ''' - return tf.matmul(inputs, self.w)+self.b if self.with_bias else tf.matmul(inputs, self.w) - - - - -class CopulaModel(Model): - ''' - Maximum-entropy copula under (possibly sparse) Spearman rank correlation constraints. - ''' - def __init__(self, d, subsets=[]): - super(CopulaModel, self).__init__() - self.d = d - if subsets == []: - subsets = [[_ for _ in range(d)]] - - self.subsets = subsets - self.n_subsets = len(self.subsets) - self.p_samples = Lambda(lambda x: x[:,:,0]) - self.q_samples = Lambda(lambda x: x[:,:,1]) - - self.fx_non_mon_layer_1s = [Dense(3, activation=tf.nn.relu) for _ in range(self.n_subsets)] - self.fx_non_mon_layer_2s = [Dense(5, activation=tf.nn.relu) for _ in range(self.n_subsets)] - self.fx_non_mon_layer_3s = [Dense(3, activation=tf.nn.relu) for _ in range(self.n_subsets)] - self.fx_non_mon_layer_4s = [Dense(1) for _ in range(self.n_subsets)] - - eff_ds = [len(subset)+1 for subset in self.subsets] - self.spears = [InitializableDense(eff_d) for eff_d in eff_ds] - self.dots = [Dot(1) for _ in range(self.n_subsets)] - - # Mixing layers - self.mixing_layer1 = Dense(5, activation=tf.nn.relu) - self.mixing_layer2 = Dense(5, activation=tf.nn.relu) - self.mixing_layer3 = Dense(1) - - - def subset_statistics(self, u, i): - ''' - Statistics function for the i-th subset of variables. - ''' - n = tf.shape(u)[0] - res = tf.zeros(shape=[n, 1], dtype=tf.float64) - ui = tf.gather(u, self.subsets[i], axis=1) - - # Constraints beyond quadratic - fui = self.fx_non_mon_layer_1s[i](ui) - fui = self.fx_non_mon_layer_2s[i](fui) - fui = self.fx_non_mon_layer_3s[i](fui) - fui = self.fx_non_mon_layer_4s[i](fui) - ui = concatenate([ui, fui], axis=1) - - # Spearman terms - spearman_term = self.spears[i](ui) - spearman_term = self.dots[i]([spearman_term, ui]) - res = tf.add(res, spearman_term) - - return res - - - def statistics(self, u): - ''' - Statistics function. - ''' - if self.n_subsets > 1: - ts = [self.subset_statistics(u, i) for i in range(self.n_subsets)] - t = concatenate(ts, axis=1) - t = self.mixing_layer1(t) - t = self.mixing_layer2(t) - t = self.mixing_layer3(t) - else: - t = self.subset_statistics(u, 0) - - return t - - - def call(self, inputs): - ''' ''' - p_samples = self.p_samples(inputs) - t_p = self.statistics(p_samples) - - q_samples = self.q_samples(inputs) - t_q = self.statistics(q_samples) - - t = concatenate([t_p, t_q], axis=1) - t = clip(t, -100., 100.) - - return t - - - def copula(self, inputs): - ''' ''' - u = tf.constant(inputs) - c = math_ops.exp(self.statistics(u)) - return c.numpy()/c.numpy().mean() - - - - -class MINDLoss(Loss): - ''' - Loss function. - ''' - def call(self, y_true, y_pred): - ''' ''' - p_samples = y_pred[:, 0] - q_samples = y_pred[:, 1] - mi = -tf.reduce_mean(p_samples) + math_ops.log(tf.reduce_mean(math_ops.exp(q_samples))) - return mi - - - -class CopulaLearner(object): - ''' - Maximum-entropy learner. - ''' - def __init__(self, d, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, \ - name='Adam', lr=0.01, subsets=[]): - self.d = d - self.model = CopulaModel(self.d, subsets=subsets) - self.opt = Adam(beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, amsgrad=amsgrad, \ - name=name, lr=lr) - self.loss = MINDLoss() - self.model.compile(optimizer=self.opt, loss=self.loss) - self.copula_entropy = None - - - def fit(self, z, batch_size=10000, steps_per_epoch=1000): - ''' ''' - epochs = 20 - z_gen = CopulaBatchGenerator(z, batch_size=batch_size, steps_per_epoch=steps_per_epoch) - self.model.fit(z_gen, epochs=epochs, batch_size=batch_size, steps_per_epoch=steps_per_epoch, \ - callbacks=[EarlyStopping(patience=3, monitor='loss'), TerminateOnNaN()]) - self.copula_entropy = self.model.evaluate(z_gen) - - +from kxy.misc.tf import CopulaLearner def copula_entropy(z, subsets=[]): ''' diff --git a/kxy/misc/tf/__init__.py b/kxy/misc/tf/__init__.py new file mode 100644 index 0000000..d6c9461 --- /dev/null +++ b/kxy/misc/tf/__init__.py @@ -0,0 +1,24 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Copyright (C) 2022 KXY TECHNOLOGIES, INC. +Author: Dr Yves-Laurent Kom Samo + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +""" +from .generators import * +from .layers import * +from .losses import * +from .models import * +from .learners import * \ No newline at end of file diff --git a/kxy/misc/tf/generators.py b/kxy/misc/tf/generators.py new file mode 100644 index 0000000..6dd7de9 --- /dev/null +++ b/kxy/misc/tf/generators.py @@ -0,0 +1,139 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Custom Tensorflow generators. +""" +import numpy as np +import tensorflow as tf +tf.keras.backend.set_floatx('float64') +tf.config.threading.set_inter_op_parallelism_threads(2) +tf.config.threading.set_intra_op_parallelism_threads(8) +tf.config.set_soft_device_placement(True) +from tensorflow.keras.utils import Sequence + +rankdata = lambda x: 1.+np.argsort(np.argsort(x, axis=0), axis=0) +class CopulaBatchGenerator(Sequence): + ''' + Random batch generator of maximum-entropy copula learning. + ''' + def __init__(self, z, batch_size=1000, steps_per_epoch=100): + self.batch_size = batch_size + self.d = z.shape[1] + self.n = z.shape[0] + self.z = z + self.steps_per_epoch = steps_per_epoch + self.emp_u = rankdata(self.z)/(self.n + 1.) + self.emp_u[np.isnan(self.z)] = 0.5 + + if self.n < 200*self.d: + dn = 200*self.d - self.n + selected_rows = np.random.choice(self.n, dn, replace=True) + emp_u = self.emp_u[selected_rows, :].copy() + scale = 1./(100.*self.n) + emp_u += (scale*np.random.rand(*emp_u.shape) - 0.5*scale) + self.emp_u = np.concatenate([self.emp_u, emp_u], axis=0) + self.n = self.emp_u.shape[0] + + self.batch_selector = np.random.choice(self.n, self.batch_size*self.steps_per_epoch, replace=True) + self.batch_selector = self.batch_selector.reshape((self.steps_per_epoch, self.batch_size)) + + + def getitem_ndarray(self, idx): + ''' ''' + i = idx % self.steps_per_epoch + selected_rows = self.batch_selector[i] + emp_u_ = self.emp_u[selected_rows, :] + z_p = emp_u_.copy() + z_q = np.random.rand(*emp_u_.shape) + + z = np.empty((self.batch_size, self.d, 2)) + z[:, :, 0] = z_p + z[:, :, 1] = z_q + batch_x = z + batch_y = np.ones((self.batch_size, 2)) # Not used + return batch_x, batch_y + + + def __getitem__(self, idx): + ''' ''' + batch_x, batch_y = self.getitem_ndarray(idx) + return tf.convert_to_tensor(batch_x), tf.convert_to_tensor(batch_y) + + + def __len__(self): + return self.steps_per_epoch + + + +class PFSBatchGenerator(Sequence): + ''' + Random batch generator. + ''' + def __init__(self, x, y, ox=None, oy=None, batch_size=1000, steps_per_epoch=100, n_shuffle=5): + assert x.shape[0] == y.shape[0] + self.batch_size = batch_size + self.n_shuffle = n_shuffle + self.n = x.shape[0] + + x = x if len(x.shape) > 1 else x[:, None] + y = y if len(y.shape) > 1 else y[:, None] + ox = ox if ox is None or len(ox.shape) > 1 else ox[:, None] + oy = oy if oy is None or len(oy.shape) > 1 else oy[:, None] + + self.x = x + self.y = y + self.ox = ox + self.oy = oy + self.z = np.concatenate([self.x, self.y, self.ox, self.oy], axis=1) if not self.ox is None else np.concatenate([self.x, self.y], axis=1) + self.d = self.z.shape[1] + + self.steps_per_epoch = steps_per_epoch + replace = False if self.n > self.batch_size*self.steps_per_epoch else True + self.batch_selector = np.random.choice(self.n, self.batch_size*self.steps_per_epoch, replace=replace) + self.batch_selector = self.batch_selector.reshape((self.steps_per_epoch, self.batch_size)) + + + def getitem_ndarray(self, idx): + ''' ''' + i = idx % self.steps_per_epoch + selected_rows = self.batch_selector[i] + x_ = self.x[selected_rows, :] + y_ = self.y[selected_rows, :] + z_ = self.z[selected_rows, :] + if not self.ox is None: + ox_ = self.ox[selected_rows, :] + oy_ = self.oy[selected_rows, :] + + z_p = None + z_q = None + for _ in range(self.n_shuffle): + z_p = z_.copy() if z_p is None else np.concatenate([z_p, z_.copy()], axis=0) + y_q = y_.copy() + randomize = np.arange(y_q.shape[0]) + np.random.shuffle(randomize) + y_q = y_q[randomize] + if not self.ox is None: + oy_q = oy_.copy() + oy_q = oy_q[randomize] + z_q_ = np.concatenate([x_, y_q.copy(), ox_, oy_q], axis=1) if not self.ox is None else np.concatenate([x_, y_q.copy()], axis=1) + z_q = z_q_.copy() if z_q is None else np.concatenate([z_q, z_q_.copy()], axis=0) + + z = np.empty((self.batch_size*self.n_shuffle, self.d, 2)) + z[:, :, 0] = z_p + z[:, :, 1] = z_q + batch_x = z + batch_y = np.ones((self.batch_size*self.n_shuffle, 2)) # Not used + return batch_x, batch_y + + + def __getitem__(self, idx): + ''' ''' + batch_x, batch_y = self.getitem_ndarray(idx) + return tf.convert_to_tensor(batch_x), tf.convert_to_tensor(batch_y) + + def __len__(self): + return self.steps_per_epoch + + + + diff --git a/kxy/misc/tf/layers.py b/kxy/misc/tf/layers.py new file mode 100644 index 0000000..d6f3fb8 --- /dev/null +++ b/kxy/misc/tf/layers.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Custom tensorflow layers. +""" +import tensorflow as tf +tf.keras.backend.set_floatx('float64') +tf.config.threading.set_inter_op_parallelism_threads(2) +tf.config.threading.set_intra_op_parallelism_threads(8) +tf.config.set_soft_device_placement(True) +from tensorflow.keras.layers import Layer + + +class InitializableDense(Layer): + ''' + ''' + def __init__(self, units, initial_w=None, initial_b=None, bias=False): + ''' + initial_w should be None or a 2D numpy array. + initial_b should be None or a 1D numpy array. + ''' + super(InitializableDense, self).__init__() + self.units = units + self.with_bias = bias + self.w_initializer = 'zeros' if initial_w is None else tf.constant_initializer(initial_w) + + if self.with_bias: + self.b_initializer = 'zeros' if initial_b is None else tf.constant_initializer(initial_b) + + + def build(self, input_shape): + ''' ''' + self.w = self.add_weight(shape=(input_shape[-1], self.units), \ + initializer=self.w_initializer, trainable=True, name='quad_w') + + if self.with_bias: + self.b = self.add_weight(shape=(self.units,), \ + initializer=self.b_initializer, trainable=True, name='quad_b') + + + def call(self, inputs): + ''' ''' + return tf.matmul(inputs, self.w)+self.b if self.with_bias else tf.matmul(inputs, self.w) diff --git a/kxy/misc/tf/learners.py b/kxy/misc/tf/learners.py new file mode 100644 index 0000000..227dedc --- /dev/null +++ b/kxy/misc/tf/learners.py @@ -0,0 +1,157 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Tensorflow learners. +""" +import numpy as np + +import tensorflow as tf +from tensorflow.keras.callbacks import EarlyStopping, TerminateOnNaN +from tensorflow.keras.optimizers import Adam + +from .generators import CopulaBatchGenerator, PFSBatchGenerator +from .models import CopulaModel, PFSModel, PFSOneShotModel +from .losses import MINDLoss + + +class CopulaLearner(object): + ''' + Maximum-entropy learner. + ''' + def __init__(self, d, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, \ + name='Adam', lr=0.01, subsets=[]): + self.d = d + self.model = CopulaModel(self.d, subsets=subsets) + self.opt = Adam(beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, amsgrad=amsgrad, \ + name=name, lr=lr) + self.loss = MINDLoss() + self.model.compile(optimizer=self.opt, loss=self.loss) + self.copula_entropy = None + + + def fit(self, z, batch_size=10000, steps_per_epoch=1000, epochs=20): + ''' ''' + z_gen = CopulaBatchGenerator(z, batch_size=batch_size, steps_per_epoch=steps_per_epoch) + self.model.fit(z_gen, epochs=epochs, batch_size=batch_size, steps_per_epoch=steps_per_epoch, \ + callbacks=[EarlyStopping(patience=3, monitor='loss'), TerminateOnNaN()]) + self.copula_entropy = self.model.evaluate(z_gen) + + + + +class PFSLearner(object): + ''' + Principal Feature Learner. + ''' + def __init__(self, dx, dy=1, dox=0, doy=0, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, \ + name='Adam', lr=0.01): + x_ixs = [_ for _ in range(dx)] + y_ixs = [dx+_ for _ in range(dy)] + ox_ixs = [dx+dy+_ for _ in range(dox)] + oy_ixs = [dx+dy+dox+_ for _ in range(doy)] + + self.model = PFSModel(x_ixs, y_ixs, ox_ixs=ox_ixs, oy_ixs=oy_ixs) + self.opt = Adam(beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, amsgrad=amsgrad, \ + name=name, lr=lr) + self.loss = MINDLoss() + self.model.compile(optimizer=self.opt, loss=self.loss) + self.mutual_information = None + self.feature_direction = None + self.statistics = None + + + def fit(self, x, y, ox=None, oy=None, batch_size=500, n_shuffle=5, epochs=20, mi_eps=0.00001): + ''' ''' + n = x.shape[0] + steps_per_epoch = n//batch_size + steps_per_epoch = min(max(steps_per_epoch, 100), 1000) + + z_gen = PFSBatchGenerator(x, y, ox=ox, oy=oy, batch_size=batch_size, \ + steps_per_epoch=steps_per_epoch, n_shuffle=n_shuffle) + self.model.fit(z_gen, epochs=epochs, batch_size=batch_size, steps_per_epoch=steps_per_epoch, \ + callbacks=[EarlyStopping(patience=3, monitor='loss'), TerminateOnNaN()]) + self.mutual_information = -self.model.evaluate(z_gen) + w = self.model.w_layer.get_weights()[0] + + if self.mutual_information < mi_eps: + # Retrain to avoid MI collapse to 0. + batch_size = 2*batch_size + z_gen = PFSBatchGenerator(x, y, ox=ox, oy=oy, batch_size=batch_size, \ + steps_per_epoch=steps_per_epoch, n_shuffle=n_shuffle) + self.model.fit(z_gen, epochs=epochs, batch_size=batch_size, steps_per_epoch=steps_per_epoch, \ + callbacks=[EarlyStopping(patience=3, monitor='loss'), TerminateOnNaN()]) + self.mutual_information = -self.model.evaluate(z_gen) + w = self.model.w_layer.get_weights()[0] + + # The feature direction should be normal + w = w.flatten() + w = w/np.sqrt(np.dot(w, w)) + + # The principal feature should point in the same direction as the target (i.e. = cov(y, w^Tx) > 0) + corr_sign = np.sign(np.corrcoef(y, np.dot(x, w))[0, 1]) + w = corr_sign*w + + self.feature_direction = w + self.fx = self.model.fx(tf.constant(z_gen.z)).numpy() + self.gy = self.model.gy(tf.constant(z_gen.z)).numpy() + + + + +class PFSOneShotLearner(object): + ''' + Principal Feature Learner learning multiple principal features simultaneously. + ''' + def __init__(self, dx, dy=1, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, \ + name='Adam', lr=0.01, p=1): + x_ixs = [_ for _ in range(dx)] + y_ixs = [dx+_ for _ in range(dy)] + + self.model = PFSOneShotModel(x_ixs, y_ixs, p=p) + self.opt = Adam(beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, amsgrad=amsgrad, \ + name=name, lr=lr) + self.loss = MINDLoss() + self.model.compile(optimizer=self.opt, loss=self.loss) + self.mutual_information = None + self.feature_direction = None + self.statistics = None + + + def fit(self, x, y, batch_size=500, n_shuffle=5, epochs=20, mi_eps=0.00001): + ''' ''' + n = x.shape[0] + steps_per_epoch = n//batch_size + steps_per_epoch = min(max(steps_per_epoch, 100), 1000) + + z_gen = PFSBatchGenerator(x, y, batch_size=batch_size, \ + steps_per_epoch=steps_per_epoch, n_shuffle=n_shuffle) + self.model.fit(z_gen, epochs=epochs, batch_size=batch_size, steps_per_epoch=steps_per_epoch) + self.mutual_information = -self.model.evaluate(z_gen) + w = self.model.w_layer.get_weights()[0] + + if self.mutual_information < mi_eps: + # Retrain to avoid MI collapse to 0. + batch_size = 2*batch_size + z_gen = PFSBatchGenerator(x, y, batch_size=batch_size, \ + steps_per_epoch=steps_per_epoch, n_shuffle=n_shuffle) + self.model.fit(z_gen, epochs=epochs, batch_size=batch_size, steps_per_epoch=steps_per_epoch) + self.mutual_information = -self.model.evaluate(z_gen) + w = self.model.w_layer.get_weights()[0] + + # The feature direction should be normal + # This should already have been taken care of downstream, but just in case. + w = w/np.linalg.norm(w, axis=0) + + # Each principal feature should point in the same direction as the target (i.e. = cov(y, w^Tx) > 0) + for j in range(w.shape[1]): + corr_sign = np.sign(np.corrcoef(y.flatten(), np.dot(x, w[:, j]))[0, 1]) + w[:, j] = corr_sign*w[:, j] + + w = w.T + + self.feature_directions = w + + + + + diff --git a/kxy/misc/tf/losses.py b/kxy/misc/tf/losses.py new file mode 100644 index 0000000..90f4575 --- /dev/null +++ b/kxy/misc/tf/losses.py @@ -0,0 +1,27 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Custom Tensorflow losses. +""" +from multiprocessing import Pool, cpu_count +import numpy as np + +import tensorflow as tf +tf.keras.backend.set_floatx('float64') +tf.config.threading.set_inter_op_parallelism_threads(2) +tf.config.threading.set_intra_op_parallelism_threads(8) +tf.config.set_soft_device_placement(True) +from tensorflow.python.ops import math_ops +from tensorflow.keras.losses import Loss + + +class MINDLoss(Loss): + ''' + Loss function. + ''' + def call(self, y_true, y_pred): + ''' ''' + p_samples = y_pred[:, 0] + q_samples = y_pred[:, 1] + mi = -tf.reduce_mean(p_samples) + math_ops.log(tf.reduce_mean(math_ops.exp(q_samples))) + return mi diff --git a/kxy/misc/tf/models.py b/kxy/misc/tf/models.py new file mode 100644 index 0000000..563e3c4 --- /dev/null +++ b/kxy/misc/tf/models.py @@ -0,0 +1,227 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Custom Tensorflow models. +""" +import numpy as np +import tensorflow as tf +tf.keras.backend.set_floatx('float64') +tf.config.threading.set_inter_op_parallelism_threads(2) +tf.config.threading.set_intra_op_parallelism_threads(8) +tf.config.set_soft_device_placement(True) + +from tensorflow.keras import Model +from tensorflow.keras.backend import clip +from tensorflow.keras.layers import Dense, Lambda, concatenate, Dot +from tensorflow.keras.constraints import UnitNorm + +from .layers import InitializableDense + + +class CopulaModel(Model): + """ + Maximum-entropy copula under (possibly sparse) Spearman rank correlation constraints. + """ + def __init__(self, d, subsets=[]): + super(CopulaModel, self).__init__() + self.d = d + if subsets == []: + subsets = [[_ for _ in range(d)]] + + self.subsets = subsets + self.n_subsets = len(self.subsets) + self.p_samples = Lambda(lambda x: x[:,:,0]) + self.q_samples = Lambda(lambda x: x[:,:,1]) + + self.fx_non_mon_layer_1s = [Dense(3, activation=tf.nn.relu) for _ in range(self.n_subsets)] + self.fx_non_mon_layer_2s = [Dense(5, activation=tf.nn.relu) for _ in range(self.n_subsets)] + self.fx_non_mon_layer_3s = [Dense(3, activation=tf.nn.relu) for _ in range(self.n_subsets)] + self.fx_non_mon_layer_4s = [Dense(1) for _ in range(self.n_subsets)] + + eff_ds = [len(subset)+1 for subset in self.subsets] + self.spears = [InitializableDense(eff_d) for eff_d in eff_ds] + self.dots = [Dot(1) for _ in range(self.n_subsets)] + + # Mixing layers + self.mixing_layer1 = Dense(5, activation=tf.nn.relu) + self.mixing_layer2 = Dense(5, activation=tf.nn.relu) + self.mixing_layer3 = Dense(1) + + + def subset_statistics(self, u, i): + ''' + Statistics function for the i-th subset of variables. + ''' + n = tf.shape(u)[0] + res = tf.zeros(shape=[n, 1], dtype=tf.float64) + ui = tf.gather(u, self.subsets[i], axis=1) + + # Constraints beyond quadratic + fui = self.fx_non_mon_layer_1s[i](ui) + fui = self.fx_non_mon_layer_2s[i](fui) + fui = self.fx_non_mon_layer_3s[i](fui) + fui = self.fx_non_mon_layer_4s[i](fui) + ui = concatenate([ui, fui], axis=1) + + # Spearman terms + spearman_term = self.spears[i](ui) + spearman_term = self.dots[i]([spearman_term, ui]) + res = tf.add(res, spearman_term) + return res + + + def statistics(self, u): + ''' + Statistics function. + ''' + if self.n_subsets > 1: + ts = [self.subset_statistics(u, i) for i in range(self.n_subsets)] + t = concatenate(ts, axis=1) + t = self.mixing_layer1(t) + t = self.mixing_layer2(t) + t = self.mixing_layer3(t) + else: + t = self.subset_statistics(u, 0) + return t + + + def call(self, inputs): + ''' + ''' + p_samples = self.p_samples(inputs) + t_p = self.statistics(p_samples) + + q_samples = self.q_samples(inputs) + t_q = self.statistics(q_samples) + + t = concatenate([t_p, t_q], axis=1) + t = clip(t, -100., 100.) + return t + + + def copula(self, inputs): + ''' + ''' + u = tf.constant(inputs) + c = math_ops.exp(self.statistics(u)) + return c.numpy()/c.numpy().mean() + + + + +class PFSOneShotModel(Model): + """ + Model for Principal Feature Selection. + """ + def __init__(self, x_ixs, y_ixs, ox_ixs=[], oy_ixs=[], p=1): + super(PFSOneShotModel, self).__init__() + self.p_samples = Lambda(lambda x: x[:,:,0]) + self.q_samples = Lambda(lambda x: x[:,:,1]) + + # Feature direction + self.w_constraint = UnitNorm(axis=0) + self.w_layer = Dense(p, use_bias=False, kernel_constraint=self.w_constraint) + + # Z network + n_outer_z = 3 # max(len(x_ixs)//10, 3) + n_inner_z = 5 # max(len(x_ixs)//10, 5) + self.z_layer_1 = Dense(n_outer_z, activation=tf.nn.relu) + self.z_layer_2 = Dense(n_inner_z, activation=tf.nn.relu) + self.z_layer_3 = Dense(n_outer_z, activation=tf.nn.relu) + + # Y network + n_outer_y = 3 # max(len(x_ixs)//100, 3) + n_inner_y = 5 # max(len(x_ixs)//100, 5) + self.y_layer_1 = Dense(n_outer_y, activation=tf.nn.relu) + self.y_layer_2 = Dense(n_inner_y, activation=tf.nn.relu) + self.y_layer_3 = Dense(n_outer_y, activation=tf.nn.relu) + + # Outer quadratic constraints + n_q = 1+n_outer_y+len(oy_ixs)+len(y_ixs)+n_outer_z+len(ox_ixs) + self.quad_load = InitializableDense(n_q) + self.quad_dot = Dot(1) + + # Indices of x and y. + self.x_ixs = x_ixs + self.y_ixs = y_ixs + self.ox_ixs = ox_ixs + self.oy_ixs = oy_ixs + + + def fx(self, u): + ''' ''' + x = tf.gather(u, self.x_ixs, axis=1) + + # z = w^Tx + z = self.w_layer(x) + + # f(z) + fx = self.z_layer_1(z) + fx = self.z_layer_2(fx) + fx = self.z_layer_3(fx) + + if self.ox_ixs: + old_fx = tf.gather(u, self.ox_ixs, axis=1) + fx = concatenate([fx, old_fx], axis=1) + + return fx + + + def gy(self, u): + ''' ''' + y = tf.gather(u, self.y_ixs, axis=1) + gy = self.y_layer_1(y) + gy = self.y_layer_2(gy) + gy = self.y_layer_3(gy) + + if self.oy_ixs: + old_gy = tf.gather(u, self.oy_ixs, axis=1) + gy = concatenate([gy, old_gy], axis=1) + + return gy + + + def statistics(self, u): + ''' + ''' + n = tf.shape(u)[0] + y = tf.gather(u, self.y_ixs, axis=1) + res = tf.zeros(shape=[n, 1], dtype=tf.float64) + + fx = self.fx(u) + gy = self.gy(u) + + # q = [1, f(z), y, g(y)]H[1, f(z), y, g(y)]^T + # Note: the intercept is critical here, do not remove it! + fxgy = concatenate([tf.ones(shape=[n, 1], dtype=tf.float64), fx, y, gy], axis=1) + q = self.quad_load(fxgy) + q = self.quad_dot([q, fxgy]) + + res = tf.add(res, q) + + return res + + + def call(self, inputs): + ''' + ''' + p_samples = self.p_samples(inputs) + t_p = self.statistics(p_samples) + + q_samples = self.q_samples(inputs) + t_q = self.statistics(q_samples) + + t = concatenate([t_p, t_q], axis=1) + t = clip(t, -200., 200.) + return t + + + +class PFSModel(PFSOneShotModel): + """ + Model for Principal Feature Selection. + """ + def __init__(self, x_ixs, y_ixs, ox_ixs=[], oy_ixs=[]): + super(PFSModel, self).__init__(x_ixs, y_ixs, ox_ixs=ox_ixs, oy_ixs=oy_ixs, p=1) + + diff --git a/kxy/pandas_extension/learning_accessor.py b/kxy/pandas_extension/learning_accessor.py index 082a47e..1ef7d15 100644 --- a/kxy/pandas_extension/learning_accessor.py +++ b/kxy/pandas_extension/learning_accessor.py @@ -5,6 +5,7 @@ from .base_accessor import BaseAccessor from ..learning.leanml_predictor import LeanMLPredictor from ..misc.predictors import BorutaPredictor, RFEPredictor, NaivePredictor +from ..pfs.pfs_predictor import PFSPredictor, PCAPredictor @@ -24,7 +25,8 @@ def fit(self, target_column, learner_func, problem_type=None, snr='auto', train_ benchmark_feature=None, missing_value_imputation=False, score='auto', n_down_perf_before_stop=3, \ regression_baseline='mean', additive_learning=False, regression_error_type='additive', return_scores=False, \ start_n_features_perf_frac=0.9, feature_selection_method='leanml', rfe_n_features=None, boruta_pval=0.5, \ - boruta_n_evaluations=20, max_duration=None, val_performance_buffer=0.0, path=None, data_identifier=None): + boruta_n_evaluations=20, max_duration=None, val_performance_buffer=0.0, path=None, data_identifier=None, \ + pca_energy_loss_frac=0.05, pfs_p=None): """ Train a lean boosted supervised learner, bringing in variables one at a time, in decreasing order of importance (as per :code:`df.kxy.variable_selection`), until doing so no longer improves validation performance or another stopping criterion is met. @@ -80,8 +82,8 @@ def fit(self, target_column, learner_func, problem_type=None, snr='auto', train_ When :code:`start_n_features` is not specified, it is set to the number of variables required to achieve a fraction :code:`start_n_features_perf_frac` of the maximum performance achievable (as per :code:`df.kxy.variable_selection`). return_scores : bool (Default False) Whether to return training, validation and testing performance after lean boosting. - feature_selection_method : str (:code:`leanml` | :code:`rfe` | :code:`boruta` | :code:`none`. Default :code:`leanml`) - Do not change this unless you want to try out Boruta or Recursive Feature Selection. The leanml method outperforms both. + feature_selection_method : str (:code:`leanml` | :code:`rfe` | :code:`boruta` | :code:`pfs` | :code:`pca` | :code:`none`. Default :code:`leanml`) + Do not change this unless you want to try out Boruta, Recursive Feature Selection, PCA or Principal Feature Selection. The leanml method outperforms all four. rfe_n_features : int The number of features to keep when the feature selection method is :code:`rfe`. boruta_pval : float @@ -94,6 +96,10 @@ def fit(self, target_column, learner_func, problem_type=None, snr='auto', train_ In LeanML feature selection, this is the threshold by which the new validation performance needs to exceed the previously evaluated validation performance to consider increasing the number of features. score : str | func The validation metric to use to determine if a new feature should be added. When set to :code:`'auto'` (the default), the :math:`R^2` is used for regression problems and the classification accuracy is used for classification problems. Any other string should be the name of a globally accessible callable. + pca_energy_loss_frac : float (Default 0.05) + The maximum fraction of energy (or variance) that left-out principal directions should account for when PCA is the feature selection method chosen. + pfs_p : int | None (Default) + The number of principal features to learn when using PFS. A value that is not :code:`None` automatically selects the one-shot flavor of PFS instead of the PCA-style. Returns @@ -132,6 +138,18 @@ def fit(self, target_column, learner_func, problem_type=None, snr='auto', train_ self.predictor = predictor res['predictor'] = predictor + elif feature_selection_method.lower() == 'pfs': + predictor = PFSPredictor() + res = predictor.fit(self._obj, target_column, learner_func, max_duration=max_duration, path=path, p=pfs_p) + self.predictor = predictor + res['predictor'] = predictor + + elif feature_selection_method.lower() == 'pca': + predictor = PCAPredictor(energy_loss_frac=pca_energy_loss_frac) + res = predictor.fit(self._obj, target_column, learner_func, max_duration=max_duration, path=path) + self.predictor = predictor + res['predictor'] = predictor + else: raise ValueError('The value of feature_selection_method (%s) is not allowed' % feature_selection_method) diff --git a/kxy/pfs/__init__.py b/kxy/pfs/__init__.py new file mode 100644 index 0000000..b788983 --- /dev/null +++ b/kxy/pfs/__init__.py @@ -0,0 +1,21 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Copyright (C) 2022 KXY TECHNOLOGIES, INC. +Author: Dr Yves-Laurent Kom Samo + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU Affero General Public License +along with this program. If not, see . +""" +from .pfs_selector import * +from .pfs_predictor import * \ No newline at end of file diff --git a/kxy/pfs/pfs_predictor.py b/kxy/pfs/pfs_predictor.py new file mode 100644 index 0000000..718ecdd --- /dev/null +++ b/kxy/pfs/pfs_predictor.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import numpy as np +import pandas as pd +import pickle as pkl + +from .pfs_selector import PFS, PCA + + +class PFSPredictor(object): + def _predict(self, obj): + assert hasattr(self, 'models'), 'The model should be first fitted' + assert hasattr(self, 'feature_directions'), 'The model should first be fitted' + assert hasattr(self, 'x_columns'), 'The model should first be fitted' + assert self.feature_directions.shape[0] > 0, 'There should be at least one feature selected' + + z = np.dot(obj[self.x_columns].values, self.feature_directions.T) + y = self.models[0].predict(z) + predictions = pd.DataFrame(index=obj.index) + predictions[self.target_column] = y + + return predictions + + + def predict(self, obj, memory_bound=False): + """ + Make predictions using the fitted model. + + + Parameters + ---------- + obj : pandas.DataFrame + A dataframe containing test explanatory variables/features about which we want to make predictions. + memory_bound : bool (Default False) + Whether we should try to save memory. + + + Returns + ------- + result : pandas.DataFrame + A dataframe with the same index as :code:`obj`, and with one column whose name is the :code:`target_column` used for training. + """ + if memory_bound: + n = obj.shape[0] + max_n = 1000000 + res = pd.DataFrame(index=obj.index) + res[self.target_column] = np.nan + i = 0 + while i < n: + res.iloc[i:i+max_n] = self._predict(obj.iloc[i:i+max_n]) + i += max_n + return res + + else: + return self._predict(obj) + + + def save(self, path): + """ + Cache the predictor to disk. + """ + meta_path = path + '-meta-' + self.__class__.__name__ + meta = {'target_column': self.target_column, 'feature_directions': self.feature_directions, 'x_columns': self.x_columns} + with open(meta_path, 'wb') as f: + pkl.dump(meta, f) + self.models[0].save(path + '-' + self.__class__.__name__) + + + @classmethod + def load(cls, path, learner_func): + """ + Load the predictor from disk. + """ + meta_path = path + '-meta-' + cls.__name__ + with open(meta_path, 'rb') as f: + meta = pkl.load(f) + target_column = meta['target_column'] + feature_directions = meta['feature_directions'] + x_columns = meta['x_columns'] + + n_vars = feature_directions.shape[0] + model = learner_func(n_vars=n_vars, path=path + '-' + cls.__name__, safe=False) + + predictor = cls() + predictor.models = [model] + predictor.feature_directions = feature_directions + predictor.target_column = target_column + predictor.x_columns = x_columns + + return predictor + + + def get_feature_selector(self): + """ + """ + return PFS() + + @property + def p(self): + return self.feature_directions.shape[0] + + + def fit(self, obj, target_column, learner_func, max_duration=None, path=None, p=None): + """ + Fits a supervised learner enriched with feature selection using the Principal Feature Selection (PFS) algorithm. + + + Parameters + ---------- + obj : pandas.DataFrame + A dataframe containing training explanatory variables/features as well as the target. + target_column : str + The name of the column in :code:`obj` containing targets. + learner_func : func | callable + Function or callable that expects one optional argument :code:`n_vars` and returns an instance of a superviser learner (regressor or classifier) following the scikit-learn convention, and expecting :code:`n_vars` features. Specifically, the learner should have a :code:`fit(x_train, y_train)` method. The learner should also have a :code:`feature_importances_` property or attribute, which is an array or a list containing feature importances once the model has been trained. There should be as many importance scores in :code:`feature_importances_` as columns in :code:`fit(x_train, y_train)`. + max_duration : float | None (default) + If not None, then feature elimination will stop after this many seconds. + p : int | None (default) + The number of principal features to learn when using one-shot PFS. + + + Attributes + ---------- + feature_directions : np.array + The matrix whose rows are the directions in which to project the original features to get principal features. + target_column : str + The name of the column used as target. + models : list + An array whose first entry is the fitted model. + x_columns : list + The list of columns used for PFS sorted alphabetically. + + + Returns + ------- + results : dict + A dictionary containing, among other things, feature directions. + + """ + self.target_column = target_column + self.x_columns = sorted([_ for _ in obj.columns if _ != target_column]) + + x = obj[self.x_columns].values + y = obj[[target_column]].values + + # Construct principal features + principal_feature_selector = self.get_feature_selector() + self.feature_directions = principal_feature_selector.fit(x, y, max_duration=max_duration, p=p) + z = np.dot(x, self.feature_directions.T) # Principal features + + # Train the learner + n_vars = self.feature_directions.shape[0] + m = learner_func(n_vars=n_vars) + m.fit(z, y) + self.models = [m] + if path: + self.save(path) + + results = {'Feature Directions': self.feature_directions} + return results + + +class PCAPredictor(PFSPredictor): + def __init__(self, energy_loss_frac=0.05): + self.energy_loss_frac = energy_loss_frac + + def get_feature_selector(self): + return PCA(energy_loss_frac=self.energy_loss_frac) + diff --git a/kxy/pfs/pfs_selector.py b/kxy/pfs/pfs_selector.py new file mode 100644 index 0000000..b939a7e --- /dev/null +++ b/kxy/pfs/pfs_selector.py @@ -0,0 +1,174 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +from time import time +import logging +import numpy as np + +from tensorflow.keras.callbacks import EarlyStopping, TerminateOnNaN +from tensorflow.keras.optimizers import Adam + +from kxy.misc.tf import PFSLearner, PFSOneShotLearner + + + + +def learn_principal_direction(y, x, ox=None, oy=None): + """ + Learn the i-th principal feature when using :math:`x` to predict :math:`y`. + + Parameters + ---------- + x : np.array + 2D array of shape :math:`(n, d)` containing original features. + y : np.array + Array of shape :math:`(n)` or :math:`(n, 1)` containing targets. + + Returns + ------- + w : np.array + The first principal direction. + mi: float + The mutual information :math:`I(y; w_i^Tx, \\dots, w_1^Tx)`. + """ + dx = 1 if len(x.shape) == 1 else x.shape[1] + dy = 1 if len(y.shape) == 1 else y.shape[1] + dox = 0 if ox is None else 1 if len(ox.shape) == 1 else ox.shape[1] + doy = 0 if oy is None else 1 if len(oy.shape) == 1 else oy.shape[1] + + learner = PFSLearner(dx, dy=dy, dox=dox, doy=doy) + learner.fit(x, y, ox=ox, oy=oy) + + mi = learner.mutual_information + w = learner.feature_direction + ox = learner.fx + oy = learner.gy + + return w, mi, ox, oy + + + +def learn_principal_directions_one_shot(y, x, p): + """ + Jointly learn p principal features. + + Parameters + ---------- + x : np.array + 2D array of shape :math:`(n, d)` containing original features. + y : np.array + Array of shape :math:`(n)` or :math:`(n, 1)` containing targets. + p : int + The number of principal features to learn. + + Returns + ------- + w : np.array + The matrix whose rows are the p principal directions. + """ + dx = 1 if len(x.shape) == 1 else x.shape[1] + learner = PFSOneShotLearner(dx, p=p) + learner.fit(x, y) + w = learner.feature_directions + + return w + + + + +class PFS(object): + def fit(self, x, y, p=None, mi_tolerance=0.0001, max_duration=None): + """ + Perform Principal Feature Selection using :math:`x` to predict :math:`y`. + + Specifically, we are looking for a :math:`p x d` matrix :math:`W` whose :math:`p` rows are learned sequentially such that :math:`z := Wx` is a great feature vector for predicting :math:`y`. + + Each row of :math:`W` is normal: :math:`||w_i||=1`, and the corresponding principal feature, namely :math:`w_i^Tx`, points in the same direction as :math:`y` (i.e. :math:`Cov(y, w_i^Tx) > 0`). + + The first row :math:`w_1` is learned so as to maximize the mutual information :math:`I(y; x^Tw_1)`. + + The second row :math:`w_2` is learned so as to maximize the conditional mutual information :math:`I(y; x^Tw_2 | x^Tw_1)`. + + More generally, the :math:`(i+1)`-th row :math:`w_{i+1}` is learned so as to maximize the conditional mutual information :math:`I(y; x^Tw_{i+1} | [x^Tw_1, ..., x^Tw_i])`. + + + Parameters + ---------- + x : np.array + 2D array of shape :math:`(n, d)` containing original features. + y : np.array + Array of shape :math:`(n)` or :math:`(n, 1)` containing targets. + p : int | None (default) + The number of features to select. When :code:`None` (the default) we stop when the estimated mutual information smaller than the mutual information tolerance parameter, or when we have exceeded the maximum duration. A value of :code:`p` that is not :code:`None` triggers one-shot PFS. + mi_tolerance: float + The smallest estimated mutual information required to keep looking for new feature directions. + max_duration : float | None (default) + The maximum amount of time (in second) to allocate to PFS. + + + Returns + ------- + W : np.array + 2D array whose rows are directions to use to compute principal features: :math:`z = Wx`. + """ + if max_duration: + start_time = time() + + rows = [] + d = 1 if len(x.shape) == 1 else x.shape[1] + if p is None: + t = y.flatten().copy() + old_mi = 0.0 + ox = None + oy = None + for i in range(d): + w, mi, ox, oy = learn_principal_direction(t, x, ox=ox, oy=oy) + + if mi-old_mi < mi_tolerance: + logging.info('The mutual information %.4f after %d round has not increase by more than %.4f: stopping.' % ( + mi, i+1, mi_tolerance)) + break + else: + logging.info('The mutual information has increased from %.4f to %.4f after %d rounds.' % (old_mi, mi, i+1)) + rows += [w.copy()] + + if max_duration: + if time()-start_time > max_duration: + logging.info('PFS has exceeded the configured maximum duration: exiting.') + break + + old_mi = mi + + if rows == []: + logging.warning('The only principal feature selected is not informative about the target: I(y; w^Tx)=%.4f' % mi) + rows += [w.copy()] + + self.feature_directions = np.array(rows) + else: + # Learn all p principal features jointly. + self.feature_directions = learn_principal_directions_one_shot(y, x, p) + + return self.feature_directions + + + +class PCA(object): + def __init__(self, energy_loss_frac=0.05): + self.energy_loss_frac = energy_loss_frac + + + def fit(self, x, _, max_duration=None, p=None): + """ + """ + cov_x = np.cov(x.T) # Columns in x should represent variables and rows observations. + u, d, v = np.linalg.svd(cov_x) + cum_energy = np.cumsum(d) + energy = cum_energy[-1] + p = len([_ for _ in cum_energy if _ <= (1.-self.energy_loss_frac)*energy]) + + self.feature_directions = u[:, :p].T + + return self.feature_directions + + + + diff --git a/setup.py b/setup.py index 277f826..09d545b 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ with open('README.md') as f: long_description = f.read() -version = "1.4.3" +version = "1.4.4" setup(name="kxy", version=version, zip_safe=False, diff --git a/tests/test_learning.py b/tests/test_learning.py index 085f9d8..1e06ba3 100644 --- a/tests/test_learning.py +++ b/tests/test_learning.py @@ -406,7 +406,6 @@ def test_non_additive_lean_boosted_classifier(): results = features_df.kxy.fit(target_column, lightgbm_classifier_cls, \ problem_type='classification', additive_learning=False, return_scores=True, \ n_down_perf_before_stop=1) - assert results['Testing Accuracy'] == '0.964' assert results['Selected Variables'] == ['Variance', 'Skewness.ABS(* - Q25(*))', 'Kurtosis', 'Skewness', 'Entropy'] @@ -425,7 +424,6 @@ def test_autogluon(): results = features_df.kxy.fit(target_column, autogluon_learner_func, \ problem_type='classification', additive_learning=False, return_scores=True, \ n_down_perf_before_stop=1) - assert results['Testing Accuracy'] == '1.000' assert results['Selected Variables'] == ['Variance', 'Skewness.ABS(* - Q25(*))', 'Kurtosis', 'Skewness'] diff --git a/tests/test_pca.py b/tests/test_pca.py new file mode 100644 index 0000000..265dbaa --- /dev/null +++ b/tests/test_pca.py @@ -0,0 +1,111 @@ +import numpy as np + +import kxy +from kxy.learning import get_sklearn_learner, get_lightgbm_learner_learning_api, get_xgboost_learner +from kxy.pfs import PCAPredictor, PCA +from kxy_datasets.regressions import Abalone +from kxy_datasets.classifications import BankNote, BankMarketing + + +def test_shape(): + dataset = Abalone() + target_column = dataset.y_column + df = dataset.df + + # Features generation + features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) + y = features_df[target_column].values + x_columns = [_ for _ in features_df.columns if _ != target_column] + x = features_df[x_columns].values + + # Principal features construction + feature_directions = PCA().fit(x, y) + assert feature_directions.shape[1] == x.shape[1] + + predictor = PCAPredictor() + learner_func = get_sklearn_learner('sklearn.ensemble.RandomForestRegressor', random_state=0) + results = predictor.fit(features_df, target_column, learner_func) + feature_directions = results['Feature Directions'] + assert feature_directions.shape[1] == x.shape[1] + + +def test_orthonormality(): + dataset = Abalone() + target_column = dataset.y_column + df = dataset.df + + # Features generation + features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) + y = features_df[target_column].values + x_columns = [_ for _ in features_df.columns if _ != target_column] + x = features_df[x_columns].values + + # Principal features construction + feature_directions = PCA().fit(x, y) + n_directions = feature_directions.shape[0] + for i in range(n_directions): + assert np.allclose(np.dot(feature_directions[i, :], feature_directions[i, :]), 1.) + for j in range(n_directions): + if j != i: + assert np.abs(np.dot(feature_directions[i, :], feature_directions[j, :])) < 1e-7 + + predictor = PCAPredictor() + learner_func = get_sklearn_learner('sklearn.ensemble.RandomForestRegressor', random_state=0) + results = predictor.fit(features_df, target_column, learner_func) + feature_directions = results['Feature Directions'] + n_directions = feature_directions.shape[0] + for i in range(n_directions): + assert np.allclose(np.dot(feature_directions[i, :], feature_directions[i, :]), 1.) + + + + + +def test_pca_feature_selection(): + # Regression + xgboost_regressor_cls = get_xgboost_learner('xgboost.XGBRegressor') + dataset = Abalone() + target_column = dataset.y_column + df = dataset.df + + # Features generation + features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) + + # Model building + results = features_df.kxy.fit(target_column, xgboost_regressor_cls, \ + problem_type='regression', feature_selection_method='pfs') + assert results['Feature Directions'].shape[1] == features_df.shape[1]-1 + predictor = results['predictor'] + predictions = predictor.predict(features_df) + assert len(predictions.columns) == 1 + assert target_column in predictions.columns + assert set(features_df.index).difference(set(predictions.index)) == set() + assert set(predictions.index).difference(set(features_df.index)) == set() + + +def test_save_pca(): + # Regression + xgboost_regressor_cls = get_xgboost_learner('xgboost.XGBRegressor') + dataset = Abalone() + target_column = dataset.y_column + df = dataset.df + + # Features generation + features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) + + # Model building + path = 'Abalone' + results = features_df.kxy.fit(target_column, xgboost_regressor_cls, \ + problem_type='regression', feature_selection_method='pca', \ + path=path) + loaded_predictor = PCAPredictor().load(path, xgboost_regressor_cls) + feature_directions = loaded_predictor.feature_directions + assert feature_directions.shape[1] == features_df.shape[1]-1 + predictions = loaded_predictor.predict(features_df) + assert len(predictions.columns) == 1 + assert target_column in predictions.columns + assert set(features_df.index).difference(set(predictions.index)) == set() + assert set(predictions.index).difference(set(features_df.index)) == set() + + + diff --git a/tests/test_pfs.py b/tests/test_pfs.py new file mode 100644 index 0000000..e760793 --- /dev/null +++ b/tests/test_pfs.py @@ -0,0 +1,131 @@ +import numpy as np + +import kxy +from kxy.learning import get_sklearn_learner, get_lightgbm_learner_learning_api, get_xgboost_learner +from kxy.pfs import PFSPredictor, PFS, PCA +from kxy_datasets.regressions import Abalone +from kxy_datasets.classifications import BankNote, BankMarketing + + +def test_shape(): + dataset = Abalone() + target_column = dataset.y_column + df = dataset.df + + # Features generation + features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) + y = features_df[target_column].values + x_columns = [_ for _ in features_df.columns if _ != target_column] + x = features_df[x_columns].values + + # Principal features construction + feature_directions = PFS().fit(x, y) + assert feature_directions.shape[1] == x.shape[1] + + predictor = PFSPredictor() + learner_func = get_sklearn_learner('sklearn.ensemble.RandomForestRegressor', random_state=0) + results = predictor.fit(features_df, target_column, learner_func) + feature_directions = results['Feature Directions'] + assert feature_directions.shape[1] == x.shape[1] + + +def test_norm(): + dataset = Abalone() + target_column = dataset.y_column + df = dataset.df + + # Features generation + features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) + y = features_df[target_column].values + x_columns = [_ for _ in features_df.columns if _ != target_column] + x = features_df[x_columns].values + + # Principal features construction + feature_directions = PFS().fit(x, y) + n_directions = feature_directions.shape[0] + for i in range(n_directions): + assert np.allclose(np.dot(feature_directions[i, :], feature_directions[i, :]), 1.) + + predictor = PFSPredictor() + learner_func = get_sklearn_learner('sklearn.ensemble.RandomForestRegressor', random_state=0) + results = predictor.fit(features_df, target_column, learner_func) + feature_directions = results['Feature Directions'] + n_directions = feature_directions.shape[0] + for i in range(n_directions): + assert np.allclose(np.dot(feature_directions[i, :], feature_directions[i, :]), 1.) + + +def test_pfs_feature_selection(): + # Regression + xgboost_regressor_cls = get_xgboost_learner('xgboost.XGBRegressor') + dataset = Abalone() + target_column = dataset.y_column + df = dataset.df + + # Features generation + features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) + + # Model building + results = features_df.kxy.fit(target_column, xgboost_regressor_cls, \ + problem_type='regression', feature_selection_method='pfs') + assert results['Feature Directions'].shape[1] == features_df.shape[1]-1 + predictor = results['predictor'] + predictions = predictor.predict(features_df) + assert len(predictions.columns) == 1 + assert target_column in predictions.columns + assert set(features_df.index).difference(set(predictions.index)) == set() + assert set(predictions.index).difference(set(features_df.index)) == set() + + +def test_save_pfs(): + # Regression + xgboost_regressor_cls = get_xgboost_learner('xgboost.XGBRegressor') + dataset = Abalone() + target_column = dataset.y_column + df = dataset.df + + # Features generation + features_df = df.kxy.generate_features(entity=None, max_lag=None, entity_name='*', exclude=[target_column]) + + # Model building + path = 'Abalone' + results = features_df.kxy.fit(target_column, xgboost_regressor_cls, \ + problem_type='regression', feature_selection_method='pfs', \ + path=path) + loaded_predictor = PFSPredictor().load(path, xgboost_regressor_cls) + feature_directions = loaded_predictor.feature_directions + assert feature_directions.shape[1] == features_df.shape[1]-1 + predictions = loaded_predictor.predict(features_df) + assert len(predictions.columns) == 1 + assert target_column in predictions.columns + assert set(features_df.index).difference(set(predictions.index)) == set() + assert set(predictions.index).difference(set(features_df.index)) == set() + + +def test_pfs_accuracy(): + # Generate the data + d = 100 + w = np.ones(d)/d + x = np.random.randn(10000, d) + xTw = np.dot(x, w) + y = xTw + 2.*xTw**2 + 0.5*xTw**3 + + # Run PFS + selector = PFS() + selector.fit(x, y) + + # Learned principal directions + F = selector.feature_directions + + # Learned principal features + z = np.dot(x, F.T) + + # Accuracy + true_f_1 = w/np.linalg.norm(w) + learned_f_1 = F[0, :] + e = np.linalg.norm(true_f_1-learned_f_1) + + assert e <= 0.10 + + +