
Commit

Adding kernel density estimate based entropy estimation for scalar random variables
drylks committed May 18, 2020
1 parent b7432b0 commit f60ccee
Showing 5 changed files with 48 additions and 33 deletions.
Binary file modified docs/images/bn_importance.png
20 changes: 10 additions & 10 deletions docs/latest/introduction/getting_started/index.rst
@@ -69,7 +69,7 @@ inputs (irrespective of the classification model)?
.. code-block:: python
>>> df.kxy.classification_feasibility('Is Fake')
- 3.58
+ 2.52
The higher the number is relative to the entropy of the response, the better. A value of 0
means no model can successfully solve this classification problem using provided inputs, no
@@ -95,17 +95,17 @@ and reduces time wasted improving models fitted on irrelevant inputs.
>>> importance_df_1 = df.kxy.individual_input_importance('Is Fake')
>>> importance_df_1
input individual_importance normalized_individual_importance cum_normalized_individual_importance
- 0  Variance  1.89  0.84  0.84
- 1  Skewness  0.37  0.16  1.00
- 2  Kurtosis  0.00  0.00  1.00
- 3  Entropy   0.00  0.00  1.00
+ 0  Variance  1.33  0.67  0.67
+ 1  Skewness  0.49  0.25  0.92
+ 2  Kurtosis  0.15  0.08  1.00
+ 3  Entropy   0.01  0.00  1.00
>>> importance_df_2 = df.kxy.incremental_input_importance('Is Fake')
>>> importance_df_2
input selection_order incremental_importance normalized_incremental_importance cum_normalized_incremental_importance
- 0  Variance  1  1.89  0.71  0.71
- 1  Skewness  2  0.32  0.12  0.83
- 2  Kurtosis  3  0.44  0.17  1.00
- 3  Entropy   4  0.00  0.00  1.00
+ 0  Variance  1  1.33  0.53  0.53
+ 1  Skewness  2  0.53  0.21  0.74
+ 2  Kurtosis  3  0.45  0.18  0.92
+ 3  Entropy   4  0.21  0.08  1.00
>>> importance_df_1 = importance_df_1.set_index(['input'])
>>> importance_df_2 = importance_df_2.set_index(['input'])
>>> importance_df = pd.concat([importance_df_1, importance_df_2], axis=1)
@@ -171,7 +171,7 @@ Back to our bank note example, given how high an out-of-sample accuracy we got,
... discrete_input_columns=(), continuous_input_columns=())
0.00
>>> train_df.kxy.classification_feasibility('Is Fake')
- 2.54
+ 1.95
As it turns out, a simple logistic regression allows us to extract nearly all of the intrinsic value there is in using the 3 inputs above to determine whether a bank note is fake. Thus, using a nonlinear model might not yield the highest ROI.

54 changes: 37 additions & 17 deletions kxy/api/core/entropy.py
@@ -7,20 +7,38 @@

import numpy as np
import scipy.special as spe
+ from scipy import stats

from kxy.api import APIClient, solve_copula_sync
- from .utils import pre_conditioner, spearman_corr, pearson_corr
+ from .utils import spearman_corr, pearson_corr


- def scalar_continuous_entropy(x, space='dual'):
+ def scalar_continuous_entropy(x, space='dual', method='gaussian-kde'):
"""
.. _scalar-continuous-entropy:
- Estimates the (differential) entropy of a continuous scalar random variable using the standard 1-spacing estimator (see [2]_ and [3]_):
+ Estimates the (differential) entropy of a continuous scalar random variable.
+ Multiple methods are supported:
+ * Gaussian moment matching
.. math::
- h(x) \\approx - \gamma(1) + \\frac{1}{n-1} \\sum_{i=1}^{n-1} \log \\left[ n \\left(x_{(i+1)} - x_{(i)} \\right) \\right],
+ h(x) = \\frac{1}{2} \\log\\left(2 \\pi e \\sigma^2 \\right)
+ * The standard 1-spacing estimator (see [2]_ and [3]_):
+ .. math::
+ h(x) \\approx - \\gamma(1) + \\frac{1}{n-1} \\sum_{i=1}^{n-1} \\log \\left[ n \\left(x_{(i+1)} - x_{(i)} \\right) \\right],
where :math:`x_{(i)}` is the i-th smallest sample, and :math:`\\gamma` is `the digamma function. <https://en.wikipedia.org/wiki/Digamma_function>`_
+ * Gaussian kernel density estimation.
+ .. math::
+ h(x) \\approx - \\frac{1}{n} \\sum_{i=1}^n \\log\\left( \\hat{p}\\left(x_i\\right) \\right)
+ where :math:`\\hat{p}` is the Gaussian kernel density estimator of the true pdf using :code:`scipy.stats.gaussian_kde`.
@@ -49,14 +67,21 @@ def scalar_continuous_entropy(x, space='dual'):
International Journal of Mathematical and Statistical Sciences. 6 (1): 17–40. (1997) ISSN 1055-7490.
"""
assert len(x.shape) == 1 or x.shape[1] == 1, 'x should be a one dimensional numpy array'
- if space == 'primal':
+ assert method in ('gaussian', '1-spacing', 'gaussian-kde')

+ if space == 'primal' or method == 'gaussian':
return 0.5*np.log(2.*np.pi*np.e*np.var(x))

- sorted_x = np.unique(x)
- n = sorted_x.shape[0]
- ent = np.sum(np.log(n*(sorted_x[1:]-sorted_x[:-1])))/n - spe.digamma(1)
+ if method == '1-spacing':
+ sorted_x = np.unique(x)
+ n = sorted_x.shape[0]
+ ent = np.sum(np.log(n*(sorted_x[1:]-sorted_x[:-1])))/n - spe.digamma(1)
+ return ent

+ if method == 'gaussian-kde':
+ kern = stats.gaussian_kde(x)
+ return -kern.logpdf(x).mean()

- return ent


def discrete_entropy(x):
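For readers unfamiliar with the new option, the 'gaussian-kde' branch above estimates the differential entropy as minus the average log-density of the sample under a Gaussian kernel density fit, using the same scipy.stats.gaussian_kde and logpdf calls introduced in this hunk. The following standalone sketch is not part of the commit (sample size and seed are arbitrary); it simply cross-checks that estimate against the Gaussian closed form on a normal sample:

    import numpy as np
    from scipy import stats

    np.random.seed(0)
    x = np.random.randn(10000)  # N(0, 1) sample; true entropy is 0.5*log(2*pi*e) ~ 1.4189

    # Gaussian moment matching: closed-form entropy of a fitted normal
    h_gaussian = 0.5*np.log(2.*np.pi*np.e*np.var(x))

    # Gaussian KDE: h(x) ~ -(1/n) * sum_i log(p_hat(x_i))
    kern = stats.gaussian_kde(x)
    h_kde = -kern.logpdf(x).mean()

    print(h_gaussian, h_kde)  # both should land near 1.4189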
@@ -161,15 +186,10 @@ def least_structured_continuous_entropy(x, space='dual'):
if len(x.shape) == 1 or x.shape[1] == 1:
return scalar_continuous_entropy(x, space=space)

+ ch = least_structured_copula_entropy(x, space=space)
+ ih = np.sum([scalar_continuous_entropy(x[:, i], space=space) for i in range(x.shape[1])])

- x = x - x.mean(axis=0)
- a, log_abs_det_a = pre_conditioner(x)
- z = np.dot(a, x.T).T
- ch = least_structured_copula_entropy(z, space=space)
- ih = np.sum([scalar_continuous_entropy(z[:, i], space=space) for i in range(z.shape[1])])

- return ih+ch-log_abs_det_a

+ return ih+ch



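The simplified return statement relies on the decomposition of a joint differential entropy into the sum of marginal entropies plus the copula entropy, h(x) = sum_i h(x_i) + h(copula), now without the pre-conditioner correction term. In the library the copula entropy comes from the KxY API (least_structured_copula_entropy via solve_copula_sync); the sketch below is not the library's estimator, it only illustrates the identity itself for a bivariate Gaussian, where the copula entropy has the closed form 0.5*log(1 - rho^2) and rho is an illustrative correlation:

    import numpy as np

    rho = 0.6
    cov = np.array([[1.0, rho], [rho, 1.0]])

    # Marginal entropies of two N(0, 1) variables
    h_marginals = 2 * (0.5*np.log(2.*np.pi*np.e*1.0))

    # Gaussian copula entropy (i.e. minus the mutual information)
    h_copula = 0.5*np.log(1. - rho**2)

    # Joint Gaussian entropy, closed form: 0.5*log((2*pi*e)^2 * det(cov))
    h_joint = 0.5*np.log(((2.*np.pi*np.e)**2) * np.linalg.det(cov))

    print(h_marginals + h_copula, h_joint)  # equal up to floating point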
5 changes: 0 additions & 5 deletions kxy/classification/pre_learning.py
@@ -123,11 +123,6 @@ def classification_input_incremental_importance(x_c, y, z_c, x_d=None, z_d=None,

x_ = np.reshape(x_c, (len(x_c), 1)) if len(x_c.shape) == 1 else x_c.copy()
z_ = np.reshape(z_c, (len(z_c), 1)) if len(z_c.shape) == 1 else z_c.copy()

- # cmi = least_mixed_conditional_mutual_information(\
- # np.hstack((x_, np.abs(x_-x_.mean(axis=0)))), y, \
- # np.hstack((z_, np.abs(z_-z_.mean(axis=0)))), x_d=x_d, z_d=z_d, space=space)

cmi = least_mixed_conditional_mutual_information(x_, y, z_, x_d=x_d, z_d=z_d, space=space, non_monotonic_extension=True)

return cmi
2 changes: 1 addition & 1 deletion setup.py
@@ -8,7 +8,7 @@
import sys
sys.path.append('.')
from setuptools import setup, find_packages
version = "0.0.14"
version = "0.0.15"

setup(name="kxy",
version=version,