Making the PFS implementation more flexible and reproducible
Yves-Laurent committed Apr 10, 2022
1 parent db1ce85 commit 7e30ca0
Showing 13 changed files with 182 additions and 40 deletions.
35 changes: 34 additions & 1 deletion CHANGELOG.md
@@ -1,5 +1,38 @@

## Changes

# Change Log
## v.1.4.7 Changes

Changes related to the optimization of Principal Feature Selection (PFS).

* Made it easy to change PFS' default learning parameters.
* Changed PFS' default learning parameters (the learning rate is now 0.005 and epsilon is 1e-04).
* Added a `seed` parameter to PFS' `fit` method for reproducibility.

To globally change the learning rate to 0.003, Adam's epsilon to 1e-5, and the number of epochs to 25, do:

```Python
from kxy.misc.tf import set_default_parameter
set_default_parameter('lr', 0.003)
set_default_parameter('epsilon', 1e-5)
set_default_parameter('epochs', 25)
```

To change the number of epochs for a single iteration of PFS, use the `epochs` argument of the `fit` method of your `PFS` object. The `fit` method now also has a `seed` parameter you may use to make the PFS implementation deterministic.

Example:
```Python
from kxy.pfs import PFS
selector = PFS()
selector.fit(x, y, epochs=25, seed=123)
```

Alternatively, you may use the `kxy.misc.tf.set_seed` function to make PFS deterministic, as illustrated below.
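
For example (a minimal sketch; `set_seed` forwards the seed to the batch generators and weight initializers introduced in this commit, and `x` and `y` are your feature matrix and target as in the example above):

```Python
from kxy.misc.tf import set_seed
from kxy.pfs import PFS

set_seed(123)  # seeds PFS' batch generators and weight initializers
selector = PFS()
selector.fit(x, y)
```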


## v.1.4.6 Changes

Minor PFS improvements.

* Adding more (robust) mutual information loss functions.
* Exposing the learned total mutual information between principal features and target as an attribute of PFS.
2 changes: 1 addition & 1 deletion Makefile
@@ -1,4 +1,4 @@
VERSION = 1.4.6
VERSION = 1.4.7

# Update the s3 bucket of the docs website
deploy_docs:
2 changes: 1 addition & 1 deletion docker/kxy/Dockerfile
@@ -22,7 +22,7 @@ RUN pip install boto3
RUN pip install tqdm

# Install kxy
RUN pip install kxy==1.4.6
RUN pip install kxy==1.4.7

# Copy examples into the Notebooks folder
RUN git clone https://github.com/kxytechnologies/kxy-python.git /opt/kxy-python
2 changes: 1 addition & 1 deletion kxy/__init__.py
@@ -19,7 +19,7 @@
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""
__version__ = "1.4.6"
__version__ = "1.4.7"

from kxy.api import *
from kxy.pre_learning import *
2 changes: 2 additions & 0 deletions kxy/misc/tf/__init__.py
@@ -27,6 +27,8 @@

from .generators import *
from .ops import *
from .config import *
from .initializers import *
from .layers import *
from .losses import *
from .models import *
27 changes: 27 additions & 0 deletions kxy/misc/tf/config.py
@@ -0,0 +1,27 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Global default training configs
"""
# LEARNING PARAMETERS
LR = 0.005
EPOCHS = 20

# ADAM PARAMETERS
BETA_1 = 0.9
BETA_2 = 0.999
EPSILON = 1e-04
AMSGRAD = False
BATCH_SIZE = 500


def set_default_parameter(name, value):
'''
Utility function to change parameters above at runtime.
'''
globals()[name.upper()] = value
return

def get_default_parameter(name):
return eval(name.upper())
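
For reference, a small usage sketch of the two helpers above (parameter names and defaults as defined in `config.py`; this assumes both helpers are re-exported by `kxy.misc.tf` through the star import added in this commit, and the lookup is case-insensitive because names are upper-cased internally):

```Python
from kxy.misc.tf import set_default_parameter, get_default_parameter

get_default_parameter('lr')         # 0.005, the new default learning rate
set_default_parameter('lr', 0.003)  # overrides the module-level LR global
get_default_parameter('lr')         # now returns 0.003
```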
20 changes: 14 additions & 6 deletions kxy/misc/tf/generators.py
@@ -11,6 +11,12 @@
tf.config.set_soft_device_placement(True)
from tensorflow.keras.utils import Sequence

LOCAL_SEED = None

def set_generators_seed(seed):
globals()['LOCAL_SEED'] = seed


rankdata = lambda x: 1.+np.argsort(np.argsort(x, axis=0), axis=0)
class CopulaBatchGenerator(Sequence):
'''
@@ -24,17 +30,18 @@ def __init__(self, z, batch_size=1000, steps_per_epoch=100):
self.steps_per_epoch = steps_per_epoch
self.emp_u = rankdata(self.z)/(self.n + 1.)
self.emp_u[np.isnan(self.z)] = 0.5
self.rnd_gen = np.random.default_rng(LOCAL_SEED)

if self.n < 200*self.d:
dn = 200*self.d - self.n
selected_rows = np.random.choice(self.n, dn, replace=True)
selected_rows = self.rnd_gen.choice(self.n, dn, replace=True)
emp_u = self.emp_u[selected_rows, :].copy()
scale = 1./(100.*self.n)
emp_u += (scale*np.random.rand(*emp_u.shape) - 0.5*scale)
emp_u += (scale*self.rnd_gen.uniform(size=emp_u.shape) - 0.5*scale)
self.emp_u = np.concatenate([self.emp_u, emp_u], axis=0)
self.n = self.emp_u.shape[0]

self.batch_selector = np.random.choice(self.n, self.batch_size*self.steps_per_epoch, replace=True)
self.batch_selector = self.rnd_gen.choice(self.n, self.batch_size*self.steps_per_epoch, replace=True)
self.batch_selector = self.batch_selector.reshape((self.steps_per_epoch, self.batch_size))


@@ -44,7 +51,7 @@ def getitem_ndarray(self, idx):
selected_rows = self.batch_selector[i]
emp_u_ = self.emp_u[selected_rows, :]
z_p = emp_u_.copy()
z_q = np.random.rand(*emp_u_.shape)
z_q = self.rnd_gen.uniform(size=emp_u_.shape)

z = np.empty((self.batch_size, self.d, 2))
z[:, :, 0] = z_p
@@ -70,6 +77,7 @@ class PFSBatchGenerator(Sequence):
Random batch generator.
'''
def __init__(self, x, y, ox=None, oy=None, batch_size=1000, steps_per_epoch=100, n_shuffle=5):
self.rnd_gen = np.random.default_rng(LOCAL_SEED)
assert x.shape[0] == y.shape[0]
self.batch_size = batch_size
self.n_shuffle = n_shuffle
@@ -89,7 +97,7 @@ def __init__(self, x, y, ox=None, oy=None, batch_size=1000, steps_per_epoch=100,

self.steps_per_epoch = steps_per_epoch
replace = False if self.n > self.batch_size*self.steps_per_epoch else True
self.batch_selector = np.random.choice(self.n, self.batch_size*self.steps_per_epoch, replace=replace)
self.batch_selector = self.rnd_gen.choice(self.n, self.batch_size*self.steps_per_epoch, replace=replace)
self.batch_selector = self.batch_selector.reshape((self.steps_per_epoch, self.batch_size))


@@ -110,7 +118,7 @@ def getitem_ndarray(self, idx):
z_p = z_.copy() if z_p is None else np.concatenate([z_p, z_.copy()], axis=0)
y_q = y_.copy()
randomize = np.arange(y_q.shape[0])
np.random.shuffle(randomize)
self.rnd_gen.shuffle(randomize)
y_q = y_q[randomize]
if not self.ox is None:
oy_q = oy_.copy()
25 changes: 25 additions & 0 deletions kxy/misc/tf/initializers.py
@@ -0,0 +1,25 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Custom Tensorflow initializers.
"""
import logging

from tensorflow.keras.initializers import GlorotUniform

LOCAL_SEED = None
INITIALIZER_COUNT = 0

def frozen_glorot_uniform():
'''
Deterministic GlorotUniform initializer.
'''
if LOCAL_SEED is not None:
initializer = GlorotUniform(LOCAL_SEED+INITIALIZER_COUNT)
globals()['INITIALIZER_COUNT'] = INITIALIZER_COUNT + 1
return initializer
else:
return GlorotUniform()

def set_initializers_seed(seed):
globals()['LOCAL_SEED'] = seed
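
Based on the logic above, a brief sketch of how `frozen_glorot_uniform` behaves once a seed is set: each call hands out a distinct but reproducible seed, so different layers do not share identical initial weights.

```Python
from kxy.misc.tf.initializers import set_initializers_seed, frozen_glorot_uniform

set_initializers_seed(42)
init_a = frozen_glorot_uniform()  # GlorotUniform(seed=42)
init_b = frozen_glorot_uniform()  # GlorotUniform(seed=43)
```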
56 changes: 46 additions & 10 deletions kxy/misc/tf/learners.py
@@ -4,34 +4,50 @@
Tensorflow learners.
"""
import numpy as np
import logging

import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, TerminateOnNaN
from tensorflow.keras.optimizers import Adam

from .generators import CopulaBatchGenerator, PFSBatchGenerator
from .generators import CopulaBatchGenerator, PFSBatchGenerator, set_generators_seed
from .initializers import set_initializers_seed
from .models import CopulaModel, PFSModel, PFSOneShotModel
from .losses import MINDLoss, ApproximateMINDLoss, RectifiedMINDLoss
from .config import get_default_parameter

def set_seed(seed):
set_generators_seed(seed)
set_initializers_seed(seed)


class CopulaLearner(object):
'''
Maximum-entropy learner.
'''
def __init__(self, d, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, \
name='Adam', lr=0.01, subsets=[]):
def __init__(self, d, beta_1=None, beta_2=None, epsilon=None, amsgrad=None, \
name='Adam', lr=None, subsets=[]):
self.d = d
self.model = CopulaModel(self.d, subsets=subsets)
beta_1 = get_default_parameter('beta_1') if beta_1 is None else beta_1
beta_2 = get_default_parameter('beta_2') if beta_2 is None else beta_2
lr = get_default_parameter('lr') if lr is None else lr
amsgrad = get_default_parameter('amsgrad') if amsgrad is None else amsgrad
epsilon = get_default_parameter('epsilon') if epsilon is None else epsilon
logging.info('Using the Adam optimizer with learning parameters: ' \
'lr: %.4f, beta_1: %.4f, beta_2: %.4f, epsilon: %.8f, amsgrad: %s' % \
(lr, beta_1, beta_2, epsilon, amsgrad))
self.opt = Adam(beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, amsgrad=amsgrad, \
name=name, lr=lr)
self.loss = MINDLoss()
self.model.compile(optimizer=self.opt, loss=self.loss)
self.copula_entropy = None


def fit(self, z, batch_size=10000, steps_per_epoch=1000, epochs=20):
def fit(self, z, batch_size=10000, steps_per_epoch=1000, epochs=None):
''' '''
z_gen = CopulaBatchGenerator(z, batch_size=batch_size, steps_per_epoch=steps_per_epoch)
epochs = get_default_parameter('epochs') if epochs is None else epochs
self.model.fit(z_gen, epochs=epochs, batch_size=batch_size, steps_per_epoch=steps_per_epoch, \
callbacks=[EarlyStopping(patience=3, monitor='loss'), TerminateOnNaN()])
self.copula_entropy = self.model.evaluate(z_gen)
@@ -43,14 +59,22 @@ class PFSLearner(object):
'''
Principal Feature Learner.
'''
def __init__(self, dx, dy=1, dox=0, doy=0, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, \
name='Adam', lr=0.01):
def __init__(self, dx, dy=1, dox=0, doy=0, beta_1=None, beta_2=None, epsilon=None, amsgrad=None, \
lr=None, name='Adam'):
x_ixs = [_ for _ in range(dx)]
y_ixs = [dx+_ for _ in range(dy)]
ox_ixs = [dx+dy+_ for _ in range(dox)]
oy_ixs = [dx+dy+dox+_ for _ in range(doy)]

self.model = PFSModel(x_ixs, y_ixs, ox_ixs=ox_ixs, oy_ixs=oy_ixs)
beta_1 = get_default_parameter('beta_1') if beta_1 is None else beta_1
beta_2 = get_default_parameter('beta_2') if beta_2 is None else beta_2
lr = get_default_parameter('lr') if lr is None else lr
amsgrad = get_default_parameter('amsgrad') if amsgrad is None else amsgrad
epsilon = get_default_parameter('epsilon') if epsilon is None else epsilon
logging.info('Using the Adam optimizer with learning parameters: ' \
'lr: %.4f, beta_1: %.4f, beta_2: %.4f, epsilon: %.8f, amsgrad: %s' % \
(lr, beta_1, beta_2, epsilon, amsgrad))
self.opt = Adam(beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, amsgrad=amsgrad, \
name=name, lr=lr)
self.loss = RectifiedMINDLoss() # MINDLoss()
@@ -60,14 +84,16 @@ def __init__(self, dx, dy=1, dox=0, doy=0, beta_1=0.9, beta_2=0.999, epsilon=1e-
self.statistics = None


def fit(self, x, y, ox=None, oy=None, batch_size=500, n_shuffle=5, epochs=20, mi_eps=0.00001):
def fit(self, x, y, ox=None, oy=None, batch_size=None, n_shuffle=5, epochs=None, mi_eps=0.00001):
''' '''
n = x.shape[0]
batch_size = get_default_parameter('batch_size') if batch_size is None else batch_size
steps_per_epoch = n//batch_size
steps_per_epoch = min(max(steps_per_epoch, 100), 1000)

z_gen = PFSBatchGenerator(x, y, ox=ox, oy=oy, batch_size=batch_size, \
steps_per_epoch=steps_per_epoch, n_shuffle=n_shuffle)
epochs = get_default_parameter('epochs') if epochs is None else epochs
self.model.fit(z_gen, epochs=epochs, batch_size=batch_size, steps_per_epoch=steps_per_epoch, \
callbacks=[EarlyStopping(patience=3, monitor='loss'), TerminateOnNaN()])
self.mutual_information = -self.model.evaluate(z_gen)
@@ -102,12 +128,20 @@ class PFSOneShotLearner(object):
'''
Principal Feature Learner learning multiple principal features simultaneously.
'''
def __init__(self, dx, dy=1, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=False, \
name='Adam', lr=0.01, p=1):
def __init__(self, dx, dy=1, beta_1=None, beta_2=None, epsilon=None, amsgrad=None, \
lr=None, name='Adam', p=1):
x_ixs = [_ for _ in range(dx)]
y_ixs = [dx+_ for _ in range(dy)]

self.model = PFSOneShotModel(x_ixs, y_ixs, p=p)
beta_1 = get_default_parameter('beta_1') if beta_1 is None else beta_1
beta_2 = get_default_parameter('beta_2') if beta_2 is None else beta_2
lr = get_default_parameter('lr') if lr is None else lr
amsgrad = get_default_parameter('amsgrad') if amsgrad is None else amsgrad
epsilon = get_default_parameter('epsilon') if epsilon is None else epsilon
logging.info('Using the Adam optimizer with learning parameters: ' \
'lr: %.4f, beta_1: %.4f, beta_2: %.4f, epsilon: %.8f, amsgrad: %s' % \
(lr, beta_1, beta_2, epsilon, amsgrad))
self.opt = Adam(beta_1=beta_1, beta_2=beta_2, epsilon=epsilon, amsgrad=amsgrad, \
name=name, lr=lr)
self.loss = RectifiedMINDLoss() # MINDLoss()
@@ -117,14 +151,16 @@ def __init__(self, dx, dy=1, beta_1=0.9, beta_2=0.999, epsilon=1e-07, amsgrad=Fa
self.statistics = None


def fit(self, x, y, batch_size=500, n_shuffle=5, epochs=20, mi_eps=0.00001):
def fit(self, x, y, batch_size=None, n_shuffle=5, epochs=None, mi_eps=0.00001):
''' '''
n = x.shape[0]
batch_size = get_default_parameter('batch_size') if batch_size is None else batch_size
steps_per_epoch = n//batch_size
steps_per_epoch = min(max(steps_per_epoch, 100), 1000)

z_gen = PFSBatchGenerator(x, y, batch_size=batch_size, \
steps_per_epoch=steps_per_epoch, n_shuffle=n_shuffle)
epochs = get_default_parameter('epochs') if epochs is None else epochs
self.model.fit(z_gen, epochs=epochs, batch_size=batch_size, steps_per_epoch=steps_per_epoch)
self.mutual_information = -self.model.evaluate(z_gen)
w = self.model.w_layer.get_weights()[0]
