Skip to content

Commit

Permalink
[eda] Added target_analysis - target variable composite analysis (#2675)
Browse files Browse the repository at this point in the history
- Added distribution_fit for analyze_interaction shortcut
- Added target_analysis composite analysis
- Added distribution plots for covariate_shift_detection when shift is detected
- dynamic figargs setting for various components; adjusted fontsize and colors
- added parameter names for distribution fit analysis
  • Loading branch information
gradientsky committed Jan 11, 2023
1 parent 7bb19c0 commit 6dd8026
Show file tree
Hide file tree
Showing 13 changed files with 600 additions and 64 deletions.
41 changes: 37 additions & 4 deletions eda/src/autogluon/eda/analysis/interaction.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

__all__ = ["Correlation", "CorrelationSignificance", "FeatureInteraction", "DistributionFit"]

from autogluon.common.features.types import R_FLOAT, R_INT


class Correlation(AbstractAnalysis):
"""
Expand Down Expand Up @@ -95,6 +97,10 @@ def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs) -> None:
state.correlations = {}
state.correlations_method = self.method
for (ds, df) in self.available_datasets(args):

if args.label in df.columns and df[args.label].dtype not in [R_INT, R_FLOAT]:
df[args.label] = df[args.label].astype("category").cat.codes

if self.method == "phik":
state.correlations[ds] = df.phik_matrix(**self.args, verbose=False)
else:
Expand Down Expand Up @@ -336,7 +342,16 @@ class DistributionFit(AbstractAnalysis):
"""

# Getting the list of distributions: https://docs.scipy.org/doc/scipy/tutorial/stats.html#getting-help
AVAILABLE_DISTRIBUTIONS = sorted([d for d in dir(stats) if isinstance(getattr(stats, d), stats.rv_continuous)])
AVAILABLE_DISTRIBUTIONS = sorted(
    name
    for name in dir(stats)
    # keep only attributes of scipy.stats that are continuous distribution objects
    if isinstance(getattr(stats, name), stats.rv_continuous)
    # excluded on purpose:
    #   - kstwo can't be fit on a single variable
    #   - levy_stable and studentized_range are too slow
    and name not in ("kstwo", "levy_stable", "studentized_range")
)

def __init__(
self,
Expand Down Expand Up @@ -377,6 +392,7 @@ def can_handle(self, state: AnalysisState, args: AnalysisState) -> bool:

def _fit(self, state: AnalysisState, args: AnalysisState, **fit_kwargs) -> None:
state.distributions_fit = {}
state.distributions_fit_pvalue_min = self.pvalue_min
for (ds, df) in self.available_datasets(args):
state.distributions_fit[ds] = {}
for c in self.columns:
Expand All @@ -401,20 +417,37 @@ def _fit_dist(self, series, pvalue_min=0.01):
if pvalue >= pvalue_min:
results[i] = {
"param": param,
"shapes": self._list_parameters(dist),
"statistic": statistic,
"pvalue": pvalue,
}
if len(results) == 0:
self.logger.warning(
f"{series.name}: none of the distributions were able to fit to satisfy specified pvalue_min: {self.pvalue_min}"
)
return None
df = pd.DataFrame(results).T.sort_values("pvalue", ascending=False)
if self.keep_top_n is not None:
df = df[: self.keep_top_n]
results = df.T.to_dict()
return results

def _list_parameters(self, distribution):
"""List parameters for scipy.stats.distribution.
# Arguments
distribution: a string or scipy.stats distribution object.
# Returns
A list of distribution parameter strings.
"""
if isinstance(distribution, str):
distribution = getattr(stats, distribution)
if distribution.shapes:
parameters = [name.strip() for name in distribution.shapes.split(",")]
else:
parameters = []
if distribution.name in stats._discrete_distns._distn_names:
parameters += ["loc"]
elif distribution.name in stats._continuous_distns._distn_names:
parameters += ["loc", "scale"]
return parameters


class FeatureDistanceAnalysis(AbstractAnalysis):
"""
Expand Down
9 changes: 8 additions & 1 deletion eda/src/autogluon/eda/auto/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,8 @@
from .simple import analyze, analyze_interaction, covariate_shift_detection, dataset_overview, quick_fit
from .simple import (
analyze,
analyze_interaction,
covariate_shift_detection,
dataset_overview,
quick_fit,
target_analysis,
)

0 comments on commit 6dd8026

Please sign in to comment.