Adding Principal Feature Selection
Yves-Laurent committed Apr 8, 2022
1 parent 861f606 commit 869db0d
Showing 21 changed files with 1,276 additions and 260 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -147,3 +147,5 @@ UCI*/
local_*.py
*do-not-commit*
AutogluonModels*
*-PFSPredictor
*-PCAPredictor
Empty file added CHANGELOG.md
2 changes: 1 addition & 1 deletion CITATION.cff
@@ -5,7 +5,7 @@ authors:
given-names: Yves-Laurent
orcid: "https://orcid.org/0000-0003-2901-6930"
title: KXY: A Seamless API to 10x The Productivity of Machine Learning Engineers.
version: 1.2.33
version: 1.4.3
date-released: "2021-10-12"
abstract: KXY is a powerful serverless analysis toolkit that takes trial-and-error out of machine learning projects.
url: "https://github.com/kxytechnologies/kxy-python"
19 changes: 16 additions & 3 deletions Makefile
@@ -1,4 +1,4 @@
VERSION = 1.4.2
VERSION = 1.4.4

# Update the s3 bucket of the docs website
deploy_docs:
@@ -25,9 +25,11 @@ docker_release:

docker_release_github:
docker build -t ghcr.io/kxytechnologies/kxy-python:latest ./docker/kxy/
echo $(CR_PAT) | docker login ghcr.io -u USERNAME --password-stdin && docker push ghcr.io/kxytechnologies/kxy-python:latest
# echo $(CR_PAT) | docker login ghcr.io -u USERNAME --password-stdin && docker push ghcr.io/kxytechnologies/kxy-python:latest
docker push ghcr.io/kxytechnologies/kxy-python:latest
docker build -t ghcr.io/kxytechnologies/kxy-python:$(VERSION) ./docker/kxy/
echo $(CR_PAT) | docker login ghcr.io -u USERNAME --password-stdin && docker push ghcr.io/kxytechnologies/kxy-python:$(VERSION)
# echo $(CR_PAT) | docker login ghcr.io -u USERNAME --password-stdin && docker push ghcr.io/kxytechnologies/kxy-python:$(VERSION)
docker push ghcr.io/kxytechnologies/kxy-python:$(VERSION)


one_shot_release:
@@ -45,6 +47,17 @@ update_docs:
make refresh_web PATHS=/reference/*


github_release:
gh release create v$(VERSION) -F CHANGELOG.md


package_release:
make pypi_release
make github_release
make docker_release_github
make docker_release


osr:
make one_shot_release

4 changes: 2 additions & 2 deletions kxy/__init__.py
@@ -4,7 +4,7 @@
__author__ = "Dr. Yves-Laurent Kom Samo"
__copyright__ = "Copyright (C) 2022 KXY Technologies, Inc."
__license__ = """
Copyright (C) 2021 KXY TECHNOLOGIES, INC.
Copyright (C) 2022 KXY TECHNOLOGIES, INC.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -19,7 +19,7 @@
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
__version__ = "1.4.3"
__version__ = "1.4.4"

from kxy.api import *
from kxy.pre_learning import *
15 changes: 8 additions & 7 deletions kxy/examples/numerai_example.py
@@ -22,7 +22,7 @@
####################
# Train/Test Split #
####################
random_seed = 0
random_seed = 2
test_df = df.sample(frac=0.8, random_state=random_seed)
train_df = df.drop(test_df.index)
train_features = train_df[feature_columns]
@@ -40,15 +40,16 @@

# Lean boosting fit
results = train_df.kxy.fit(target_column, lightgbm_regressor_learner_cls, \
problem_type=problem_type, feature_selection_method='leanml', \
data_identifier='numerai_training_data_int8_train_seed_%d.parquet.gzip' % random_seed, \
snr='high')
problem_type=problem_type, feature_selection_method='pfs', pfs_p=100, \
data_identifier='numerai_training_data_int8_train_seed_%d.parquet.gzip' % random_seed)

predictor = results['predictor']
selected_features = predictor.selected_variables
p = predictor.feature_directions.shape[0]
print('Number of features: %d' % p)

print('Selected Variables')
print(selected_features)
# selected_features = predictor.selected_variables
# print('Selected Variables')
# print(selected_features)

# Training/Testing Predictions
train_predictions = predictor.predict(train_features)
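For readers trying the new PFS path, here is a minimal sketch of how this example could be rounded out on the held-out split. It reuses the script's own variables (test_df, feature_columns, target_column, predictor); the assumption that predict returns a DataFrame containing the target column is ours, not shown in this diff.

import numpy as np

# Held-out features/labels, built the same way as the train split above.
test_features = test_df[feature_columns]
test_labels = test_df[[target_column]]

# Out-of-sample predictions from the fitted PFS predictor.
test_predictions = predictor.predict(test_features)

# RMSE on the held-out split (assumes predict returns a DataFrame
# keyed by target_column).
rmse = np.sqrt(np.mean((test_predictions[target_column].values-\
	test_labels[target_column].values)**2))
print('Testing RMSE: %.4f' % rmse)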
242 changes: 1 addition & 241 deletions kxy/misc/mind.py
@@ -1,253 +1,13 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
TensorFlow Implementation of MIND ([1]) under Spearman rank correlation constraints.
[1] Kom Samo, Y. (2021). Inductive Mutual Information Estimation: A Convex Maximum-Entropy Copula Approach. Proceedings of The 24th International Conference on Artificial Intelligence and Statistics, PMLR 130:2242-2250. Available from https://proceedings.mlr.press/v130/kom-samo21a.html.
"""
import logging
logging.basicConfig(level=logging.INFO)

from multiprocessing import Pool, cpu_count
import numpy as np
import tensorflow as tf
tf.keras.backend.set_floatx('float64')
tf.config.threading.set_inter_op_parallelism_threads(2)
tf.config.threading.set_intra_op_parallelism_threads(8)
tf.config.set_soft_device_placement(True)

from tensorflow.keras import Model
from tensorflow.keras.backend import pow as tf_pow
from tensorflow.keras.backend import cast, clip
from tensorflow.keras.callbacks import EarlyStopping, TerminateOnNaN
from tensorflow.keras.layers import Dense, Lambda, concatenate, Dot, Dropout, Layer
from tensorflow.keras.losses import Loss
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import Sequence
from tensorflow.python.ops import math_ops

rankdata = lambda x: 1.+np.argsort(np.argsort(x, axis=0), axis=0)



class CopulaBatchGenerator(Sequence):
'''
Random batch generator.
'''
def __init__(self, z, batch_size=1000, steps_per_epoch=100):
self.batch_size = batch_size
self.d = z.shape[1]
self.n = z.shape[0]
self.z = z
self.steps_per_epoch = steps_per_epoch
self.emp_u = rankdata(self.z)/(self.n + 1.)
self.emp_u[np.isnan(self.z)] = 0.5

if self.n < 200*self.d:
dn = 200*self.d - self.n
selected_rows = np.random.choice(self.n, dn, replace=True)
emp_u = self.emp_u[selected_rows, :].copy()
scale = 1./(100.*self.n)
emp_u += (scale*np.random.rand(*emp_u.shape) - 0.5*scale)
self.emp_u = np.concatenate([self.emp_u, emp_u], axis=0)
self.n = self.emp_u.shape[0]

self.batch_selector = np.random.choice(self.n, self.batch_size*self.steps_per_epoch, replace=True)
self.batch_selector = self.batch_selector.reshape((self.steps_per_epoch, self.batch_size))


def getitem_ndarray(self, idx):
''' '''
i = idx % self.steps_per_epoch
selected_rows = self.batch_selector[i]
emp_u_ = self.emp_u[selected_rows, :]
z_p = emp_u_.copy()
z_q = np.random.rand(*emp_u_.shape)

z = np.empty((self.batch_size, self.d, 2))
z[:, :, 0] = z_p
z[:, :, 1] = z_q
batch_x = z
batch_y = np.ones((self.batch_size, 2)) # Not used
return batch_x, batch_y


def __getitem__(self, idx):
''' '''
batch_x, batch_y = self.getitem_ndarray(idx)
return tf.convert_to_tensor(batch_x), tf.convert_to_tensor(batch_y)


def __len__(self):
return self.steps_per_epoch



class InitializableDense(Layer):
'''
'''
def __init__(self, units, initial_w=None, initial_b=None, bias=False):
'''
initial_w should be None or a 2D numpy array.
initial_b should be None or a 1D numpy array.
'''
super(InitializableDense, self).__init__()
self.units = units
self.with_bias = bias
self.w_initializer = 'zeros' if initial_w is None else tf.constant_initializer(initial_w)

if self.with_bias:
self.b_initializer = 'zeros' if initial_b is None else tf.constant_initializer(initial_b)


def build(self, input_shape):
''' '''
self.w = self.add_weight(shape=(input_shape[-1], self.units), \
initializer=self.w_initializer, trainable=True, name='quad_w')

if self.with_bias:
self.b = self.add_weight(shape=(self.units,), \
initializer=self.b_initializer, trainable=True, name='quad_b')


def call(self, inputs):
''' '''
return tf.matmul(inputs, self.w)+self.b if self.with_bias else tf.matmul(inputs, self.w)




class CopulaModel(Model):
'''
Maximum-entropy copula under (possibly sparse) Spearman rank correlation constraints.
'''
def __init__(self, d, subsets=[]):
super(CopulaModel, self).__init__()
self.d = d
if subsets == []:
subsets = [[_ for _ in range(d)]]

self.subsets = subsets
self.n_subsets = len(self.subsets)
self.p_samples = Lambda(lambda x: x[:,:,0])
self.q_samples = Lambda(lambda x: x[:,:,1])

self.fx_non_mon_layer_1s = [Dense(3, activation=tf.nn.relu) for _ in range(self.n_subsets)]
self.fx_non_mon_layer_2s = [Dense(5, activation=tf.nn.relu) for _ in range(self.n_subsets)]
self.fx_non_mon_layer_3s = [Dense(3, activation=tf.nn.relu) for _ in range(self.n_subsets)]
self.fx_non_mon_layer_4s = [Dense(1) for _ in range(self.n_subsets)]

eff_ds = [len(subset)+1 for subset in self.subsets]
self.spears = [InitializableDense(eff_d) for eff_d in eff_ds]
self.dots = [Dot(1) for _ in range(self.n_subsets)]

# Mixing layers
self.mixing_layer1 = Dense(5, activation=tf.nn.relu)
self.mixing_layer2 = Dense(5, activation=tf.nn.relu)
self.mixing_layer3 = Dense(1)


def subset_statistics(self, u, i):
'''
Statistics function for the i-th subset of variables.
'''
n = tf.shape(u)[0]
res = tf.zeros(shape=[n, 1], dtype=tf.float64)
ui = tf.gather(u, self.subsets[i], axis=1)

# Constraints beyond quadratic
fui = self.fx_non_mon_layer_1s[i](ui)
fui = self.fx_non_mon_layer_2s[i](fui)
fui = self.fx_non_mon_layer_3s[i](fui)
fui = self.fx_non_mon_layer_4s[i](fui)
ui = concatenate([ui, fui], axis=1)

# Spearman terms
spearman_term = self.spears[i](ui)
spearman_term = self.dots[i]([spearman_term, ui])
res = tf.add(res, spearman_term)

return res


def statistics(self, u):
'''
Statistics function.
'''
if self.n_subsets > 1:
ts = [self.subset_statistics(u, i) for i in range(self.n_subsets)]
t = concatenate(ts, axis=1)
t = self.mixing_layer1(t)
t = self.mixing_layer2(t)
t = self.mixing_layer3(t)
else:
t = self.subset_statistics(u, 0)

return t


def call(self, inputs):
''' '''
p_samples = self.p_samples(inputs)
t_p = self.statistics(p_samples)

q_samples = self.q_samples(inputs)
t_q = self.statistics(q_samples)

t = concatenate([t_p, t_q], axis=1)
t = clip(t, -100., 100.)

return t


def copula(self, inputs):
''' '''
u = tf.constant(inputs)
c = math_ops.exp(self.statistics(u))
return c.numpy()/c.numpy().mean()




class MINDLoss(Loss):
'''
Loss function.
'''
def call(self, y_true, y_pred):
''' '''
p_samples = y_pred[:, 0]
q_samples = y_pred[:, 1]
mi = -tf.reduce_mean(p_samples) + math_ops.log(tf.reduce_mean(math_ops.exp(q_samples)))
return mi



class CopulaLearner(object):
'''
Maximum-entropy learner.
'''
def __init__(self, d, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, \
name='Adam', lr=0.01, subsets=[]):
self.d = d
self.model = CopulaModel(self.d, subsets=subsets)
self.opt = Adam(beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, amsgrad=amsgrad, \
name=name, lr=lr)
self.loss = MINDLoss()
self.model.compile(optimizer=self.opt, loss=self.loss)
self.copula_entropy = None


def fit(self, z, batch_size=10000, steps_per_epoch=1000):
''' '''
epochs = 20
z_gen = CopulaBatchGenerator(z, batch_size=batch_size, steps_per_epoch=steps_per_epoch)
self.model.fit(z_gen, epochs=epochs, batch_size=batch_size, steps_per_epoch=steps_per_epoch, \
callbacks=[EarlyStopping(patience=3, monitor='loss'), TerminateOnNaN()])
self.copula_entropy = self.model.evaluate(z_gen)



from kxy.misc.tf import CopulaLearner

def copula_entropy(z, subsets=[]):
'''
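For context on the code that moved: MINDLoss above is the negative of the variational lower bound I >= E_p[T] - log E_q[exp(T)], with the statistics T evaluated at empirical copula samples (the p batch) and at independent uniforms (the q batch). Below is a minimal numpy sketch of the two ingredients visible in this diff, using toy stand-ins for the network's outputs.

import numpy as np

# Empirical copula transform, as in CopulaBatchGenerator: column-wise
# normalized ranks mapped into (0, 1).
rankdata = lambda x: 1. + np.argsort(np.argsort(x, axis=0), axis=0)
z = np.random.randn(1000, 3)          # toy data
u = rankdata(z) / (z.shape[0] + 1.)   # copula ("p") samples
v = np.random.rand(*u.shape)          # independent uniform ("q") samples

# MIND loss, as in MINDLoss.call, with toy stand-ins for the model's
# statistics at the p and q samples.
t_p = np.random.randn(1000)
t_q = np.random.randn(1000)
loss = -t_p.mean() + np.log(np.exp(t_q).mean())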
24 changes: 24 additions & 0 deletions kxy/misc/tf/__init__.py
@@ -0,0 +1,24 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Copyright (C) 2022 KXY TECHNOLOGIES, INC.
Author: Dr Yves-Laurent Kom Samo
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
from .generators import *
from .layers import *
from .losses import *
from .models import *
from .learners import *
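
A minimal sketch of how the relocated learner might be driven after this refactor, mirroring the from kxy.misc.tf import CopulaLearner line added to kxy/misc/mind.py above; the constructor and fit signatures follow the code shown in this diff.

import numpy as np
from kxy.misc.tf import CopulaLearner

# Estimate the copula entropy of a toy 5-dimensional sample.
z = np.random.randn(10000, 5)
learner = CopulaLearner(5)
learner.fit(z)                  # trains the maximum-entropy copula model
print(learner.copula_entropy)   # populated by fit, as in the code above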
