Adding Principal Feature Selection
Yves-Laurent committed Apr 8, 2022
1 parent 861f606 commit 869db0d
Showing 21 changed files with 1,276 additions and 260 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -147,3 +147,5 @@ UCI*/
local_*.py
*do-not-commit*
AutogluonModels*
*-PFSPredictor
*-PCAPredictor
Empty file added CHANGELOG.md
2 changes: 1 addition & 1 deletion CITATION.cff
@@ -5,7 +5,7 @@ authors:
given-names: Yves-Laurent
orcid: "https://orcid.org/0000-0003-2901-6930"
title: KXY: A Seamless API to 10x The Productivity of Machine Learning Engineers.
version: 1.2.33
version: 1.4.3
date-released: "2021-10-12"
abstract: KXY is a powerful serverless analysis toolkit that takes trial-and-error out of machine learning projects.
url: "https://github.com/kxytechnologies/kxy-python"
19 changes: 16 additions & 3 deletions Makefile
@@ -1,4 +1,4 @@
VERSION = 1.4.2
VERSION = 1.4.4

# Update the s3 bucket of the docs website
deploy_docs:
@@ -25,9 +25,11 @@ docker_release:

docker_release_github:
docker build -t ghcr.io/kxytechnologies/kxy-python:latest ./docker/kxy/
echo $(CR_PAT) | docker login ghcr.io -u USERNAME --password-stdin && docker push ghcr.io/kxytechnologies/kxy-python:latest
# echo $(CR_PAT) | docker login ghcr.io -u USERNAME --password-stdin && docker push ghcr.io/kxytechnologies/kxy-python:latest
docker push ghcr.io/kxytechnologies/kxy-python:latest
docker build -t ghcr.io/kxytechnologies/kxy-python:$(VERSION) ./docker/kxy/
echo $(CR_PAT) | docker login ghcr.io -u USERNAME --password-stdin && docker push ghcr.io/kxytechnologies/kxy-python:$(VERSION)
# echo $(CR_PAT) | docker login ghcr.io -u USERNAME --password-stdin && docker push ghcr.io/kxytechnologies/kxy-python:$(VERSION)
docker push ghcr.io/kxytechnologies/kxy-python:$(VERSION)


one_shot_release:
@@ -45,6 +47,17 @@ update_docs:
make refresh_web PATHS=/reference/*


github_release:
gh release create v$(VERSION) -F CHANGELOG.md


package_release:
make pypi_release
make github_release
make docker_release_github
make docker_release


osr:
make one_shot_release

4 changes: 2 additions & 2 deletions kxy/__init__.py
@@ -4,7 +4,7 @@
__author__ = "Dr. Yves-Laurent Kom Samo"
__copyright__ = "Copyright (C) 2022 KXY Technologies, Inc."
__license__ = """
Copyright (C) 2021 KXY TECHNOLOGIES, INC.
Copyright (C) 2022 KXY TECHNOLOGIES, INC.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -19,7 +19,7 @@
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
__version__ = "1.4.3"
__version__ = "1.4.4"

from kxy.api import *
from kxy.pre_learning import *
15 changes: 8 additions & 7 deletions kxy/examples/numerai_example.py
@@ -22,7 +22,7 @@
####################
# Train/Test Split #
####################
random_seed = 0
random_seed = 2
test_df = df.sample(frac=0.8, random_state=random_seed)
train_df = df.drop(test_df.index)
train_features = train_df[feature_columns]
@@ -40,15 +40,16 @@

# Lean boosting fit
results = train_df.kxy.fit(target_column, lightgbm_regressor_learner_cls, \
problem_type=problem_type, feature_selection_method='leanml', \
data_identifier='numerai_training_data_int8_train_seed_%d.parquet.gzip' % random_seed, \
snr='high')
problem_type=problem_type, feature_selection_method='pfs', pfs_p=100, \
data_identifier='numerai_training_data_int8_train_seed_%d.parquet.gzip' % random_seed)

predictor = results['predictor']
selected_features = predictor.selected_variables
p = predictor.feature_directions.shape[0]
print('Number of features: %d' % p)

print('Selected Variables')
print(selected_features)
# selected_features = predictor.selected_variables
# print('Selected Variables')
# print(selected_features)

# Training/Testing Predictions
train_predictions = predictor.predict(train_features)
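For readers trying the new PFS path, here is a minimal sketch of how this example could be rounded out on the held-out split. It reuses the script's own variables (test_df, feature_columns, target_column, predictor); the assumption that predict returns a DataFrame containing the target column is ours, not shown in this diff.

import numpy as np

# Held-out features/labels, built the same way as the train split above.
test_features = test_df[feature_columns]
test_labels = test_df[[target_column]]

# Out-of-sample predictions from the fitted PFS predictor.
test_predictions = predictor.predict(test_features)

# RMSE on the held-out split (assumes predict returns a DataFrame
# keyed by target_column).
rmse = np.sqrt(np.mean((test_predictions[target_column].values-\
	test_labels[target_column].values)**2))
print('Testing RMSE: %.4f' % rmse)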
242 changes: 1 addition & 241 deletions kxy/misc/mind.py
@@ -1,253 +1,13 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
TensorFlow Implementation of MIND ([1]) under Spearman rank correlation constraints.
[1] Kom Samo, Y. (2021). Inductive Mutual Information Estimation: A Convex Maximum-Entropy Copula Approach. Proceedings of The 24th International Conference on Artificial Intelligence and Statistics, PMLR 130:2242-2250. Available from https://proceedings.mlr.press/v130/kom-samo21a.html.
"""
import logging
logging.basicConfig(level=logging.INFO)

from multiprocessing import Pool, cpu_count
import numpy as np
import tensorflow as tf
tf.keras.backend.set_floatx('float64')
tf.config.threading.set_inter_op_parallelism_threads(2)
tf.config.threading.set_intra_op_parallelism_threads(8)
tf.config.set_soft_device_placement(True)

from tensorflow.keras import Model
from tensorflow.keras.backend import pow as tf_pow
from tensorflow.keras.backend import cast, clip
from tensorflow.keras.callbacks import EarlyStopping, TerminateOnNaN
from tensorflow.keras.layers import Dense, Lambda, concatenate, Dot, Dropout, Layer
from tensorflow.keras.losses import Loss
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import Sequence
from tensorflow.python.ops import math_ops

rankdata = lambda x: 1.+np.argsort(np.argsort(x, axis=0), axis=0)



class CopulaBatchGenerator(Sequence):
'''
Random batch generator.
'''
def __init__(self, z, batch_size=1000, steps_per_epoch=100):
self.batch_size = batch_size
self.d = z.shape[1]
self.n = z.shape[0]
self.z = z
self.steps_per_epoch = steps_per_epoch
self.emp_u = rankdata(self.z)/(self.n + 1.)
self.emp_u[np.isnan(self.z)] = 0.5

if self.n < 200*self.d:
dn = 200*self.d - self.n
selected_rows = np.random.choice(self.n, dn, replace=True)
emp_u = self.emp_u[selected_rows, :].copy()
scale = 1./(100.*self.n)
emp_u += (scale*np.random.rand(*emp_u.shape) - 0.5*scale)
self.emp_u = np.concatenate([self.emp_u, emp_u], axis=0)
self.n = self.emp_u.shape[0]

self.batch_selector = np.random.choice(self.n, self.batch_size*self.steps_per_epoch, replace=True)
self.batch_selector = self.batch_selector.reshape((self.steps_per_epoch, self.batch_size))


def getitem_ndarray(self, idx):
''' '''
i = idx % self.steps_per_epoch
selected_rows = self.batch_selector[i]
emp_u_ = self.emp_u[selected_rows, :]
z_p = emp_u_.copy()
z_q = np.random.rand(*emp_u_.shape)

z = np.empty((self.batch_size, self.d, 2))
z[:, :, 0] = z_p
z[:, :, 1] = z_q
batch_x = z
batch_y = np.ones((self.batch_size, 2)) # Not used
return batch_x, batch_y


def __getitem__(self, idx):
''' '''
batch_x, batch_y = self.getitem_ndarray(idx)
return tf.convert_to_tensor(batch_x), tf.convert_to_tensor(batch_y)


def __len__(self):
return self.steps_per_epoch



class InitializableDense(Layer):
'''
'''
def __init__(self, units, initial_w=None, initial_b=None, bias=False):
'''
initial_w should be None or a 2D numpy array.
initial_b should be None or a 1D numpy array.
'''
super(InitializableDense, self).__init__()
self.units = units
self.with_bias = bias
self.w_initializer = 'zeros' if initial_w is None else tf.constant_initializer(initial_w)

if self.with_bias:
self.b_initializer = 'zeros' if initial_b is None else tf.constant_initializer(initial_b)


def build(self, input_shape):
''' '''
self.w = self.add_weight(shape=(input_shape[-1], self.units), \
initializer=self.w_initializer, trainable=True, name='quad_w')

if self.with_bias:
self.b = self.add_weight(shape=(self.units,), \
initializer=self.b_initializer, trainable=True, name='quad_b')


def call(self, inputs):
''' '''
return tf.matmul(inputs, self.w)+self.b if self.with_bias else tf.matmul(inputs, self.w)




class CopulaModel(Model):
'''
Maximum-entropy copula under (possibly sparse) Spearman rank correlation constraints.
'''
def __init__(self, d, subsets=[]):
super(CopulaModel, self).__init__()
self.d = d
if subsets == []:
subsets = [[_ for _ in range(d)]]

self.subsets = subsets
self.n_subsets = len(self.subsets)
self.p_samples = Lambda(lambda x: x[:,:,0])
self.q_samples = Lambda(lambda x: x[:,:,1])

self.fx_non_mon_layer_1s = [Dense(3, activation=tf.nn.relu) for _ in range(self.n_subsets)]
self.fx_non_mon_layer_2s = [Dense(5, activation=tf.nn.relu) for _ in range(self.n_subsets)]
self.fx_non_mon_layer_3s = [Dense(3, activation=tf.nn.relu) for _ in range(self.n_subsets)]
self.fx_non_mon_layer_4s = [Dense(1) for _ in range(self.n_subsets)]

eff_ds = [len(subset)+1 for subset in self.subsets]
self.spears = [InitializableDense(eff_d) for eff_d in eff_ds]
self.dots = [Dot(1) for _ in range(self.n_subsets)]

# Mixing layers
self.mixing_layer1 = Dense(5, activation=tf.nn.relu)
self.mixing_layer2 = Dense(5, activation=tf.nn.relu)
self.mixing_layer3 = Dense(1)


def subset_statistics(self, u, i):
'''
Statistics function for the i-th subset of variables.
'''
n = tf.shape(u)[0]
res = tf.zeros(shape=[n, 1], dtype=tf.float64)
ui = tf.gather(u, self.subsets[i], axis=1)

# Constraints beyond quadratic
fui = self.fx_non_mon_layer_1s[i](ui)
fui = self.fx_non_mon_layer_2s[i](fui)
fui = self.fx_non_mon_layer_3s[i](fui)
fui = self.fx_non_mon_layer_4s[i](fui)
ui = concatenate([ui, fui], axis=1)

# Spearman terms
spearman_term = self.spears[i](ui)
spearman_term = self.dots[i]([spearman_term, ui])
res = tf.add(res, spearman_term)

return res


def statistics(self, u):
'''
Statistics function.
'''
if self.n_subsets > 1:
ts = [self.subset_statistics(u, i) for i in range(self.n_subsets)]
t = concatenate(ts, axis=1)
t = self.mixing_layer1(t)
t = self.mixing_layer2(t)
t = self.mixing_layer3(t)
else:
t = self.subset_statistics(u, 0)

return t


def call(self, inputs):
''' '''
p_samples = self.p_samples(inputs)
t_p = self.statistics(p_samples)

q_samples = self.q_samples(inputs)
t_q = self.statistics(q_samples)

t = concatenate([t_p, t_q], axis=1)
t = clip(t, -100., 100.)

return t


def copula(self, inputs):
''' '''
u = tf.constant(inputs)
c = math_ops.exp(self.statistics(u))
return c.numpy()/c.numpy().mean()




class MINDLoss(Loss):
'''
Loss function.
'''
def call(self, y_true, y_pred):
''' '''
p_samples = y_pred[:, 0]
q_samples = y_pred[:, 1]
mi = -tf.reduce_mean(p_samples) + math_ops.log(tf.reduce_mean(math_ops.exp(q_samples)))
return mi



class CopulaLearner(object):
'''
Maximum-entropy learner.
'''
def __init__(self, d, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, \
name='Adam', lr=0.01, subsets=[]):
self.d = d
self.model = CopulaModel(self.d, subsets=subsets)
self.opt = Adam(beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, amsgrad=amsgrad, \
name=name, lr=lr)
self.loss = MINDLoss()
self.model.compile(optimizer=self.opt, loss=self.loss)
self.copula_entropy = None


def fit(self, z, batch_size=10000, steps_per_epoch=1000):
''' '''
epochs = 20
z_gen = CopulaBatchGenerator(z, batch_size=batch_size, steps_per_epoch=steps_per_epoch)
self.model.fit(z_gen, epochs=epochs, batch_size=batch_size, steps_per_epoch=steps_per_epoch, \
callbacks=[EarlyStopping(patience=3, monitor='loss'), TerminateOnNaN()])
self.copula_entropy = self.model.evaluate(z_gen)



from kxy.misc.tf import CopulaLearner

def copula_entropy(z, subsets=[]):
'''
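For context on the code that moved: MINDLoss above is the negative of the variational lower bound I >= E_p[T] - log E_q[exp(T)], with the statistics T evaluated at empirical copula samples (the p batch) and at independent uniforms (the q batch). Below is a minimal numpy sketch of the two ingredients visible in this diff, using toy stand-ins for the network's outputs.

import numpy as np

# Empirical copula transform, as in CopulaBatchGenerator: column-wise
# normalized ranks mapped into (0, 1).
rankdata = lambda x: 1. + np.argsort(np.argsort(x, axis=0), axis=0)
z = np.random.randn(1000, 3)          # toy data
u = rankdata(z) / (z.shape[0] + 1.)   # copula ("p") samples
v = np.random.rand(*u.shape)          # independent uniform ("q") samples

# MIND loss, as in MINDLoss.call, with toy stand-ins for the model's
# statistics at the p and q samples.
t_p = np.random.randn(1000)
t_q = np.random.randn(1000)
loss = -t_p.mean() + np.log(np.exp(t_q).mean())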
24 changes: 24 additions & 0 deletions kxy/misc/tf/__init__.py
@@ -0,0 +1,24 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Copyright (C) 2022 KXY TECHNOLOGIES, INC.
Author: Dr Yves-Laurent Kom Samo
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
from .generators import *
from .layers import *
from .losses import *
from .models import *
from .learners import *
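
A minimal sketch of how the relocated learner might be driven after this refactor, mirroring the from kxy.misc.tf import CopulaLearner line added to kxy/misc/mind.py above; the constructor and fit signatures follow the code shown in this diff.

import numpy as np
from kxy.misc.tf import CopulaLearner

# Estimate the copula entropy of a toy 5-dimensional sample.
z = np.random.randn(10000, 5)
learner = CopulaLearner(5)
learner.fit(z)                  # trains the maximum-entropy copula model
print(learner.copula_entropy)   # populated by fit, as in the code above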
