-
-
Notifications
You must be signed in to change notification settings - Fork 25k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[MRG] FEA Gower distance #16834
base: main
Are you sure you want to change the base?
[MRG] FEA Gower distance #16834
Changes from 210 commits
e5fdbbb
dcf96f4
41a2748
d3221a7
da71fba
a63c43f
e50d9d9
47b20a9
12b773b
a32f8e7
3480bf2
7be14ba
3d1f2bc
181a750
b8da4c9
e31f72b
5096b76
1230b6f
16b756f
db6303b
7cb7ce9
9679345
066d9fa
348bf40
1b6f8b6
89b8884
ab8a61d
ef90d8e
dd1fdcd
71ce0c5
705fec9
9e5a2ac
1ed4550
a3a3135
ecb50be
57693b1
9fd98c7
ed2ce90
6708e0d
2ca1fa4
fb21d0f
8cc70af
a7654e4
52bb273
fcc9519
1df1fea
0339b55
19f1c57
dcf3a37
2b1a697
2cb2802
84dfcf1
d798f06
4889385
1cd6979
206cd26
43b77ef
6d847d4
9379e2c
4bf77e7
992b5cb
dbc6f55
da825de
3090915
4d10175
460484f
8b7f236
c699f8d
ddf9022
b3ad764
b99aacc
8f1bcd3
cda1b54
6b10f24
5209834
cc0184f
ceb4b44
172d21f
f3b1544
8ed3ecb
0c4d489
6c1054e
ca12d35
9c09d9b
ed74af0
f40273f
7dd2a9b
c5a4472
3a9f576
d3b10fe
fcb4763
6f2d98d
5c6c30d
745de05
6fa6e88
077b3cb
ded653a
eb1ee32
782eb3d
4c03f5c
3ca56d5
29d82d5
16d339d
6ea57ac
16b9377
1474df8
49a5ac2
c92d47d
67491ce
e123d36
a993bbe
5e4cf76
66650fa
23966ff
ae7f556
d257bba
52ad60f
336c183
c4959fa
faa404f
4a2d89e
5b84803
cc58403
f69fd04
098bef9
bab9ca0
bba8828
26779a0
87e2f63
ff6366b
0931f81
fa44c39
4a46ae1
38d99d5
a93efa5
512428d
6a403d5
29cd45e
a811d57
4d6d584
dbd4af5
091a7fa
117cca0
34e78ae
0a802c3
6df57c2
e610965
1cddfdf
850caa6
545e496
6b438cf
b9d2188
df73f9e
bc08577
a73852e
127bc7b
8cd9ca3
e8c6624
a339e48
27b7fd9
8e37937
d11d8e7
c375fee
462c6f3
8f4e9de
58770f0
b3270a8
188e0ca
eb1ab6b
eab56c4
5f3421e
9317415
d16f833
b23fc65
da6b46d
d84be70
e67579d
e5167e0
7de895b
c88cf0f
08e692a
19e4f0b
7d480b2
14d0d8b
3b3bb54
82707d2
e187e01
7370840
a86ba38
c0f3ee2
d1a116f
8ddfb1b
b37f750
88f835d
77d925f
72bc1dc
984a6a0
cf861bd
1510744
988028a
8454f97
f1d840d
a8f2a65
37359f0
63c179e
8786f5d
c1f3599
fa281c3
c8e840d
4859b81
6ace7da
3232262
89fae67
2ff6e0a
ab8a1f8
d8c2fb5
3d6cd99
08177af
58f4b81
0312f1e
80e71a0
b0e3b11
c0502b9
adb7854
7287b1a
f31ca7b
330f57a
aa8758d
0e8feb8
9635f76
3708a67
9262412
d8b445e
3d433fc
7b6278c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -21,16 +21,20 @@ | |||||
|
||||||
from ..utils.validation import _num_samples | ||||||
from ..utils.validation import check_non_negative | ||||||
from ..utils.validation import check_consistent_length | ||||||
from ..utils import check_array | ||||||
from ..utils import gen_even_slices | ||||||
from ..utils import gen_batches, get_chunk_n_rows | ||||||
from ..utils import is_scalar_nan | ||||||
from ..utils import _safe_indexing | ||||||
from ..utils.extmath import row_norms, safe_sparse_dot | ||||||
from ..preprocessing import normalize | ||||||
from ..utils._mask import _get_mask | ||||||
|
||||||
from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan | ||||||
from ..exceptions import DataConversionWarning | ||||||
from ..utils.fixes import _object_dtype_isnan | ||||||
from ..preprocessing import MinMaxScaler | ||||||
|
||||||
|
||||||
# Utility Functions | ||||||
|
@@ -544,7 +548,7 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean", | |||||
Valid values for metric are: | ||||||
|
||||||
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', | ||||||
'manhattan'] | ||||||
'manhattan', 'gower'] | ||||||
|
||||||
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', | ||||||
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', | ||||||
|
@@ -632,7 +636,7 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean", | |||||
Valid values for metric are: | ||||||
|
||||||
- from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2', | ||||||
'manhattan'] | ||||||
'manhattan', 'gower'] | ||||||
|
||||||
- from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev', | ||||||
'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski', | ||||||
|
@@ -829,6 +833,177 @@ def cosine_distances(X, Y=None): | |||||
return S | ||||||
|
||||||
|
||||||
def _split_categorical_numerical(X, categorical_features):
    """Split ``X`` column-wise into a categorical and a numerical part.

    The split is done before any pairwise array validation so that
    numerical columns are not converted to object dtype together with the
    categorical ones.

    Parameters
    ----------
    X : array-like, pandas.DataFrame or None
        Input data. Objects exposing neither ``iloc`` nor ``__array__``
        (e.g. nested lists) are first converted with ``check_array`` using
        ``dtype=object`` and ``force_all_finite=False``.

    categorical_features : column selector, callable or None
        Columns to treat as categorical. A callable is called with ``X`` and
        its return value is used as the selector. ``None`` selects no
        column.

    Returns
    -------
    X_cat, X_num : tuple
        The selected columns and the remaining columns of ``X``, both
        obtained through ``_safe_indexing`` on ``axis=1``. Both are ``None``
        when ``X`` is ``None``.
    """
    if X is None:
        # Nothing to split; callers pass ``Y=None`` through this helper.
        return None, None

    # TODO: this should be more like check_array(..., accept_pandas=True)
    if not hasattr(X, 'iloc') and not hasattr(X, '__array__'):
        # ``np.object`` was a deprecated alias of the builtin ``object`` and
        # has been removed from recent NumPy releases; use the builtin.
        X = check_array(X, dtype=object, force_all_finite=False)

    if callable(categorical_features):
        cols = categorical_features(X)
    else:
        cols = categorical_features
    if cols is None:
        cols = []

    X_cat = _safe_indexing(X, cols, axis=1)
    # NOTE(review): ``inverse=True`` is presumably the complement selection
    # added to ``_safe_indexing`` by this change set -- confirm it exists.
    X_num = _safe_indexing(X, cols, axis=1, inverse=True)
    return X_cat, X_num
|
||||||
|
||||||
def gower_distances(X, Y=None, categorical_features=None, scale=True, | ||||||
min_values=None, scale_factor=None): | ||||||
"""Compute the distances between the observations in X and Y, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. PEP257: this should be a one-line summary |
||||||
that may contain mixed types of data, using an implementation | ||||||
of Gower formula. | ||||||
|
||||||
Parameters | ||||||
---------- | ||||||
X : {array-like, pandas.DataFrame} of shape (n_samples, n_features) | ||||||
|
||||||
Y : {array-like, pandas.DataFrame} of shape (n_samples, n_features), \ | ||||||
default=None | ||||||
|
||||||
categorical_features : array-like of str, array-like of int, \ | ||||||
array-like of bool, slice or callable, default=None | ||||||
Indexes the data on its second axis. Integers are interpreted as | ||||||
positional columns, while strings can reference DataFrame columns | ||||||
by name. | ||||||
A callable is passed the input data `X` and can return any of the | ||||||
above. To select multiple columns by name or dtype, you can use | ||||||
:obj:`make_column_selector`. | ||||||
adrinjalali marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
By default all non-numeric columns are considered categorical. | ||||||
|
||||||
scale : bool, default=True | ||||||
Indicates if the numerical columns will be scaled between 0 and 1. | ||||||
If false, it is assumed the numerical columns are already scaled. | ||||||
The scaling factors, _i.e._ min and max, are taken from both ``X`` and | ||||||
adrinjalali marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
``Y``. | ||||||
|
||||||
min_values : ndarray of shape (n_features,), default=None | ||||||
adrinjalali marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
Per feature adjustment for minimum. Equivalent to | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does "Equivalent to" apply to the case where |
||||||
``min_values - X.min(axis=0) * scale_factor`` | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is it a function of itself? |
||||||
If provided, ``scale_factor`` should be provided as well. | ||||||
|
||||||
scale_factor : ndarray of shape (n_features,), default=None | ||||||
adrinjalali marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
Per feature relative scaling of the data. Equivalent to | ||||||
``(max_values - min_values) / (X.max(axis=0) - X.min(axis=0))`` | ||||||
If provided, ``min_values`` should be provided as well. | ||||||
|
||||||
Returns | ||||||
------- | ||||||
distances : ndarray of shape (n_samples_X, n_samples_Y) | ||||||
|
||||||
References | ||||||
---------- | ||||||
Gower, J.C., 1971, A General Coefficient of Similarity and Some of Its | ||||||
Properties. | ||||||
|
||||||
Notes | ||||||
----- | ||||||
Categorical ordinal attributes should be treated as numeric for the purpose | ||||||
of Gower similarity. | ||||||
|
||||||
Current implementation does not support sparse matrices. | ||||||
|
||||||
All the non-numerical types (e.g., str), are treated as categorical | ||||||
adrinjalali marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
features. | ||||||
|
||||||
This implementation modifies the Gower's original similarity measure in | ||||||
the following aspects: | ||||||
|
||||||
* The values in the original similarity S range between 0 and 1. To | ||||||
adrinjalali marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
guarantee this, it is assumed the numerical features of X and Y are | ||||||
scaled between 0 and 1. | ||||||
|
||||||
* Different from the original similarity S, this implementation | ||||||
returns 1-S. | ||||||
""" | ||||||
def _n_cols(X): | ||||||
# TODO: improve this, and add it to validation.py? | ||||||
if hasattr(X, 'shape'): | ||||||
return X.shape[1] | ||||||
return np.asarray(X).shape[1] | ||||||
adrinjalali marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
def _nanmanhatan(x, y): | ||||||
return np.nansum(np.abs(x - y)) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is using this row-wise function call worthwhile relative to something more vectorised? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's kinda easy to do more vectorized when X and Y are of the same size, otherwise I'm not sure if it's worth the complexity. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also, this implementation is significantly faster than the cosine distances for instance. So I don't think we should worry about the speed too much? |
||||||
|
||||||
def _non_nans(x, y): | ||||||
return np.sum(~_object_dtype_isnan(x) & ~_object_dtype_isnan(y)) | ||||||
adrinjalali marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
def _nanhamming(x, y): | ||||||
return np.sum(x != y) - np.sum( | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. or There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. doesn't seem to help the time at least when I test it. |
||||||
_object_dtype_isnan(x) | _object_dtype_isnan(y)) | ||||||
|
||||||
if issparse(X) or issparse(Y): | ||||||
raise TypeError("Gower distance does not support sparse matrices") | ||||||
|
||||||
if X is None or len(X) == 0: | ||||||
raise ValueError("X can not be None or empty") | ||||||
|
||||||
if scale: | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder whether it's worth running |
||||||
if (scale_factor is None) != (min_values is None): | ||||||
raise ValueError("min_value and scale_factor should be provided " | ||||||
"together.") | ||||||
X_cat, X_num = _split_categorical_numerical(X, categorical_features) | ||||||
Y_cat, Y_num = _split_categorical_numerical(Y, categorical_features) | ||||||
adrinjalali marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
if min_values is not None: | ||||||
min_values = np.asarray(min_values) | ||||||
scale_factor = np.asarray(scale_factor) | ||||||
check_consistent_length(min_values, scale_factor, | ||||||
np.ndarray(shape=(_n_cols(X_num), 0))) | ||||||
|
||||||
if _n_cols(X_num): | ||||||
X_num, Y_num = check_pairwise_arrays(X_num, Y_num, precomputed=False, | ||||||
dtype=float, | ||||||
force_all_finite=False) | ||||||
if scale: | ||||||
scale_data = X_num if Y_num is X_num else np.vstack((X_num, Y_num)) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. the |
||||||
if scale_factor is None: | ||||||
trs = MinMaxScaler().fit(scale_data) | ||||||
else: | ||||||
trs = MinMaxScaler() | ||||||
trs.scale_ = scale_factor | ||||||
trs.min_ = min_values | ||||||
X_num = trs.transform(X_num) | ||||||
Y_num = trs.transform(Y_num) | ||||||
|
||||||
nan_manhatan = distance.cdist(X_num, Y_num, _nanmanhatan) | ||||||
valid_num = distance.cdist(X_num, Y_num, _non_nans) | ||||||
else: | ||||||
nan_manhatan = valid_num = None | ||||||
|
||||||
if _n_cols(X_cat): | ||||||
X_cat, Y_cat = check_pairwise_arrays(X_cat, Y_cat, precomputed=False, | ||||||
dtype=np.object, | ||||||
force_all_finite=False) | ||||||
nan_hamming = distance.cdist(X_cat, Y_cat, _nanhamming) | ||||||
valid_cat = distance.cdist(X_cat, Y_cat, _non_nans) | ||||||
else: | ||||||
nan_hamming = valid_cat = None | ||||||
|
||||||
# based on whether there are categorical and/or numerical data present, | ||||||
# we compute the distance metric | ||||||
# Division by zero and nans warnings are ignored since they are expected | ||||||
with np.errstate(divide='ignore', invalid='ignore'): | ||||||
if valid_num is not None and valid_cat is not None: | ||||||
D = (nan_manhatan + nan_hamming) / (valid_num + valid_cat) | ||||||
elif valid_num is not None: | ||||||
D = nan_manhatan / valid_num | ||||||
else: | ||||||
D = nan_hamming / valid_cat | ||||||
return D | ||||||
|
||||||
|
||||||
# Paired distances | ||||||
def paired_euclidean_distances(X, Y): | ||||||
""" | ||||||
|
@@ -905,7 +1080,7 @@ def paired_cosine_distances(X, Y): | |||||
'l2': paired_euclidean_distances, | ||||||
'l1': paired_manhattan_distances, | ||||||
'manhattan': paired_manhattan_distances, | ||||||
'cityblock': paired_manhattan_distances} | ||||||
'cityblock': paired_manhattan_distances, } | ||||||
|
||||||
|
||||||
def paired_distances(X, Y, metric="euclidean", **kwds): | ||||||
|
@@ -1298,6 +1473,7 @@ def chi2_kernel(X, Y=None, gamma=1.): | |||||
'l2': euclidean_distances, | ||||||
'l1': manhattan_distances, | ||||||
'manhattan': manhattan_distances, | ||||||
'gower': gower_distances, | ||||||
'precomputed': None, # HACK: precomputed is always allowed, never called | ||||||
'nan_euclidean': nan_euclidean_distances, | ||||||
} | ||||||
|
@@ -1322,6 +1498,7 @@ def distance_metrics(): | |||||
'l1' metrics.pairwise.manhattan_distances | ||||||
'l2' metrics.pairwise.euclidean_distances | ||||||
'manhattan' metrics.pairwise.manhattan_distances | ||||||
'gower' metrics.pairwise.gower_distances | ||||||
'nan_euclidean' metrics.pairwise.nan_euclidean_distances | ||||||
=============== ======================================== | ||||||
|
||||||
|
@@ -1400,7 +1577,7 @@ def _pairwise_callable(X, Y, metric, force_all_finite=True, **kwds): | |||||
'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', | ||||||
'russellrao', 'seuclidean', 'sokalmichener', | ||||||
'sokalsneath', 'sqeuclidean', 'yule', "wminkowski", | ||||||
'nan_euclidean', 'haversine'] | ||||||
'nan_euclidean', 'haversine', 'gower'] | ||||||
|
||||||
_NAN_METRICS = ['nan_euclidean'] | ||||||
|
||||||
|
@@ -1429,6 +1606,28 @@ def _check_chunk_size(reduced, chunk_size): | |||||
def _precompute_metric_params(X, Y, metric=None, **kwds): | ||||||
"""Precompute data-derived metric parameters if not provided | ||||||
""" | ||||||
if metric == 'gower': | ||||||
categorical_features = kwds.get('categorical_features', None) | ||||||
|
||||||
_, X_num = _split_categorical_numerical(X, categorical_features) | ||||||
_, Y_num = _split_categorical_numerical(Y, categorical_features) | ||||||
|
||||||
scale = kwds.get('scale', True) | ||||||
if not scale: | ||||||
return {'min_values': None, 'scale_factor': None, 'scale': False} | ||||||
|
||||||
scale_factor = kwds.get('scale_factor', None) | ||||||
min_values = kwds.get('min_values', None) | ||||||
if min_values is None: | ||||||
data = X_num if Y is X or Y is None else np.vstack((X_num, Y_num)) | ||||||
trs = MinMaxScaler().fit(data) | ||||||
min_values = trs.min_ | ||||||
scale_factor = trs.scale_ | ||||||
|
||||||
return {'min_values': min_values, | ||||||
'scale_factor': scale_factor, | ||||||
'scale': True} | ||||||
adrinjalali marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
if metric == "seuclidean" and 'V' not in kwds: | ||||||
if X is Y: | ||||||
V = np.var(X, axis=0, ddof=1) | ||||||
|
@@ -1721,6 +1920,17 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None, | |||||
check_non_negative(X, whom=whom) | ||||||
return X | ||||||
elif metric in PAIRWISE_DISTANCE_FUNCTIONS: | ||||||
if metric == 'gower': | ||||||
""" | ||||||
# These conversions are necessary for matrices with string values | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's not clear why this code is commented out. |
||||||
if not isinstance(X, np.ndarray): | ||||||
X = np.asarray(X, dtype=np.object) | ||||||
if Y is not None and not isinstance(Y, np.ndarray): | ||||||
Y = np.asarray(Y, dtype=np.object) | ||||||
""" | ||||||
params = _precompute_metric_params(X, Y, metric=metric, **kwds) | ||||||
kwds.update(**params) | ||||||
|
||||||
func = PAIRWISE_DISTANCE_FUNCTIONS[metric] | ||||||
elif callable(metric): | ||||||
func = partial(_pairwise_callable, metric=metric, | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
here I'm basically converting the data to a numpy array only if it's not a pandas dataframe or an array already. Feels like it should be a
check_array(X, ..., accept_pandas=True)
call.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why avoid calling
check_array
if X is already an array?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fair, but still need to avoid calling it if it's a pandas DF.