
Commit

Adding kernel density estimate based entropy estimation for scalar random variables
drylks committed May 18, 2020
1 parent b7432b0 commit f60ccee
Showing 5 changed files with 48 additions and 33 deletions.
Binary file modified docs/images/bn_importance.png
20 changes: 10 additions & 10 deletions docs/latest/introduction/getting_started/index.rst
@@ -69,7 +69,7 @@ inputs (irrespective of the classification model)?
.. code-block:: python
>>> df.kxy.classification_feasibility('Is Fake')
- 3.58
+ 2.52
The higher the number is relative to the entropy of the response, the better. A value of 0
means no model can successfully solve this classification problem using provided inputs, no
@@ -95,17 +95,17 @@ and reduces time wasted improving models fitted on irrelevant inputs.
>>> importance_df_1 = df.kxy.individual_input_importance('Is Fake')
>>> importance_df_1
input individual_importance normalized_individual_importance cum_normalized_individual_importance
- 0  Variance  1.89  0.84  0.84
- 1  Skewness  0.37  0.16  1.00
- 2  Kurtosis  0.00  0.00  1.00
- 3  Entropy   0.00  0.00  1.00
+ 0  Variance  1.33  0.67  0.67
+ 1  Skewness  0.49  0.25  0.92
+ 2  Kurtosis  0.15  0.08  1.00
+ 3  Entropy   0.01  0.00  1.00
>>> importance_df_2 = df.kxy.incremental_input_importance('Is Fake')
>>> importance_df_2
input selection_order incremental_importance normalized_incremental_importance cum_normalized_incremental_importance
- 0  Variance  1  1.89  0.71  0.71
- 1  Skewness  2  0.32  0.12  0.83
- 2  Kurtosis  3  0.44  0.17  1.00
- 3  Entropy   4  0.00  0.00  1.00
+ 0  Variance  1  1.33  0.53  0.53
+ 1  Skewness  2  0.53  0.21  0.74
+ 2  Kurtosis  3  0.45  0.18  0.92
+ 3  Entropy   4  0.21  0.08  1.00
>>> importance_df_1 = importance_df_1.set_index(['input'])
>>> importance_df_2 = importance_df_2.set_index(['input'])
>>> importance_df = pd.concat([importance_df_1, importance_df_2], axis=1)
@@ -171,7 +171,7 @@ Back to our bank note example, given how high an out-of-sample accuracy we got,
... discrete_input_columns=(), continuous_input_columns=())
0.00
>>> train_df.kxy.classification_feasibility('Is Fake')
- 2.54
+ 1.95
As it turns out, a simple logistic regression allows us to extract nearly all of the intrinsic value there is in using the 3 inputs above to determine whether a bank note is fake. Thus, using a nonlinear model might not yield the highest ROI.

54 changes: 37 additions & 17 deletions kxy/api/core/entropy.py
@@ -7,20 +7,38 @@

import numpy as np
import scipy.special as spe
+ from scipy import stats

from kxy.api import APIClient, solve_copula_sync
- from .utils import pre_conditioner, spearman_corr, pearson_corr
+ from .utils import spearman_corr, pearson_corr


- def scalar_continuous_entropy(x, space='dual'):
+ def scalar_continuous_entropy(x, space='dual', method='gaussian-kde'):
"""
.. _scalar-continuous-entropy:
- Estimates the (differential) entropy of a continuous scalar random variable using the standard 1-spacing estimator (see [2]_ and [3]_):
+ Estimates the (differential) entropy of a continuous scalar random variable.
+ Multiple methods are supported:
+ * Gaussian moment matching
.. math::
- h(x) \\approx - \gamma(1) + \\frac{1}{n-1} \\sum_{i=1}^{n-1} \log \\left[ n \\left(x_{(i+1)} - x_{(i)} \\right) \\right],
+ h(x) = \\frac{1}{2} \\log\\left(2 \\pi e \\sigma^2 \\right)
+ * The standard 1-spacing estimator (see [2]_ and [3]_):
+ .. math::
+ h(x) \\approx - \\gamma(1) + \\frac{1}{n-1} \\sum_{i=1}^{n-1} \\log \\left[ n \\left(x_{(i+1)} - x_{(i)} \\right) \\right],
where :math:`x_{(i)}` is the i-th smallest sample, and :math:`\\gamma` is `the digamma function. <https://en.wikipedia.org/wiki/Digamma_function>`_
+ * Gaussian kernel density estimation.
+ .. math::
+ h(x) \\approx - \\frac{1}{n} \\sum_{i=1}^n \\log\\left( \\hat{p}\\left(x_i\\right) \\right)
+ where :math:`\\hat{p}` is the Gaussian kernel density estimator of the true pdf using :code:`scipy.stats.gaussian_kde`.
@@ -49,14 +67,21 @@ def scalar_continuous_entropy(x, space='dual'):
International Journal of Mathematical and Statistical Sciences. 6 (1): 17–40. (1997) ISSN 1055-7490.
"""
assert len(x.shape) == 1 or x.shape[1] == 1, 'x should be a one dimensional numpy array'
- if space == 'primal':
+ assert method in ('gaussian', '1-spacing', 'gaussian-kde')

+ if space == 'primal' or method == 'gaussian':
return 0.5*np.log(2.*np.pi*np.e*np.var(x))

- sorted_x = np.unique(x)
- n = sorted_x.shape[0]
- ent = np.sum(np.log(n*(sorted_x[1:]-sorted_x[:-1])))/n - spe.digamma(1)
+ if method == '1-spacing':
+ sorted_x = np.unique(x)
+ n = sorted_x.shape[0]
+ ent = np.sum(np.log(n*(sorted_x[1:]-sorted_x[:-1])))/n - spe.digamma(1)
+ return ent

+ if method == 'gaussian-kde':
+ kern = stats.gaussian_kde(x)
+ return -kern.logpdf(x).mean()

- return ent


def discrete_entropy(x):
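For readers unfamiliar with the new option, the 'gaussian-kde' branch above estimates the differential entropy as minus the average log-density of the sample under a Gaussian kernel density fit, using the same scipy.stats.gaussian_kde and logpdf calls introduced in this hunk. The following standalone sketch is not part of the commit (sample size and seed are arbitrary); it simply cross-checks that estimate against the Gaussian closed form on a normal sample:

    import numpy as np
    from scipy import stats

    np.random.seed(0)
    x = np.random.randn(10000)  # N(0, 1) sample; true entropy is 0.5*log(2*pi*e) ~ 1.4189

    # Gaussian moment matching: closed-form entropy of a fitted normal
    h_gaussian = 0.5*np.log(2.*np.pi*np.e*np.var(x))

    # Gaussian KDE: h(x) ~ -(1/n) * sum_i log(p_hat(x_i))
    kern = stats.gaussian_kde(x)
    h_kde = -kern.logpdf(x).mean()

    print(h_gaussian, h_kde)  # both should land near 1.4189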
@@ -161,15 +186,10 @@ def least_structured_continuous_entropy(x, space='dual'):
if len(x.shape) == 1 or x.shape[1] == 1:
return scalar_continuous_entropy(x, space=space)

+ ch = least_structured_copula_entropy(x, space=space)
+ ih = np.sum([scalar_continuous_entropy(x[:, i], space=space) for i in range(x.shape[1])])

- x = x - x.mean(axis=0)
- a, log_abs_det_a = pre_conditioner(x)
- z = np.dot(a, x.T).T
- ch = least_structured_copula_entropy(z, space=space)
- ih = np.sum([scalar_continuous_entropy(z[:, i], space=space) for i in range(z.shape[1])])

- return ih+ch-log_abs_det_a

+ return ih+ch



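The simplified return statement relies on the decomposition of a joint differential entropy into the sum of marginal entropies plus the copula entropy, h(x) = sum_i h(x_i) + h(copula), now without the pre-conditioner correction term. In the library the copula entropy comes from the KxY API (least_structured_copula_entropy via solve_copula_sync); the sketch below is not the library's estimator, it only illustrates the identity itself for a bivariate Gaussian, where the copula entropy has the closed form 0.5*log(1 - rho^2) and rho is an illustrative correlation:

    import numpy as np

    rho = 0.6
    cov = np.array([[1.0, rho], [rho, 1.0]])

    # Marginal entropies of two N(0, 1) variables
    h_marginals = 2 * (0.5*np.log(2.*np.pi*np.e*1.0))

    # Gaussian copula entropy (i.e. minus the mutual information)
    h_copula = 0.5*np.log(1. - rho**2)

    # Joint Gaussian entropy, closed form: 0.5*log((2*pi*e)^2 * det(cov))
    h_joint = 0.5*np.log(((2.*np.pi*np.e)**2) * np.linalg.det(cov))

    print(h_marginals + h_copula, h_joint)  # equal up to floating point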
5 changes: 0 additions & 5 deletions kxy/classification/pre_learning.py
@@ -123,11 +123,6 @@ def classification_input_incremental_importance(x_c, y, z_c, x_d=None, z_d=None,

x_ = np.reshape(x_c, (len(x_c), 1)) if len(x_c.shape) == 1 else x_c.copy()
z_ = np.reshape(z_c, (len(z_c), 1)) if len(z_c.shape) == 1 else z_c.copy()

- # cmi = least_mixed_conditional_mutual_information(\
- # np.hstack((x_, np.abs(x_-x_.mean(axis=0)))), y, \
- # np.hstack((z_, np.abs(z_-z_.mean(axis=0)))), x_d=x_d, z_d=z_d, space=space)

cmi = least_mixed_conditional_mutual_information(x_, y, z_, x_d=x_d, z_d=z_d, space=space, non_monotonic_extension=True)

return cmi
2 changes: 1 addition & 1 deletion setup.py
@@ -8,7 +8,7 @@
import sys
sys.path.append('.')
from setuptools import setup, find_packages
version = "0.0.14"
version = "0.0.15"

setup(name="kxy",
version=version,