scikit-learn · adrinjalali · Dec 20, 2018 · Dec 20, 2018 · Apr 10, 2019 · Apr 25, 2019
diff --git a/doc/conftest.py b/doc/conftest.py
@@ -50,6 +50,13 @@ def setup_compose():
         raise SkipTest("Skipping compose.rst, pandas not installed")
 
 
+def setup_metrics():
+    """Skip the metrics.rst doctests when pandas cannot be imported."""
+    reason = "Skipping metrics.rst, pandas not installed"
+    try:
+        import pandas  # noqa
+    except ImportError:
+        raise SkipTest(reason)
+
+
 def setup_impute():
     try:
         import pandas  # noqa
@@ -82,6 +89,8 @@ def pytest_runtest_setup(item):
         setup_working_with_text_data()
     elif fname.endswith('modules/compose.rst') or is_index:
         setup_compose()
+    elif fname.endswith('modules/metrics.rst') or is_index:
+        setup_metrics()
     elif IS_PYPY and fname.endswith('modules/feature_extraction.rst'):
         raise SkipTest('FeatureHasher is not compatible with PyPy')
     elif fname.endswith('modules/impute.rst'):

diff --git a/doc/modules/metrics.rst b/doc/modules/metrics.rst
@@ -93,6 +93,61 @@ is equivalent to :func:`linear_kernel`, only slower.)
       Information Retrieval. Cambridge University Press.
       https://nlp.stanford.edu/IR-book/html/htmledition/the-vector-space-model-for-scoring-1.html
 
+.. _gower_distances:
+
+Gower distances
+-----------------
+
+The function :func:`~sklearn.metrics.pairwise.gower_distances` computes the
+distances between the observations in X and Y, which may contain combinations
+of numerical, boolean, or categorical attributes, using an implementation of
+the Gower similarity.
+
+.. math::
+
+    g(\mathbf{x}, \mathbf{y}) = \frac{\sum_i(s(x_i, y_i))}{|\{i| x_i \neq \text{missing} \land y_i \neq \text{missing}\}|}
+
+Where:
+
+:math:`x, y` : array_like of shape (n_features,) are the observations to be compared.
+
+:math:`s(x_i, y_i)` : the per-feature distance, defined as:
+
+    - :math:`s(x_i, y_i) := 0`, if either :math:`x_i` or :math:`y_i` is missing.
+    - :math:`s(x_i, y_i) := \text{int}(x_i \neq y_i)`, if :math:`i` represents
+      a boolean or categorical attribute.
+    - :math:`s(x_i, y_i) := |x_i - y_i|`, if :math:`i` represents a numerical
+      attribute (numerical attributes are scaled to :math:`[0, 1]` by default).
+
+
+The Gower formula combines a Manhattan (L1) distance for numeric features
+with Hamming distance for categorical features to obtain a general coefficient
+for categorical and numeric data.
+
+The :func:`gower_distances` function expects the user to specify the
+categorical features, otherwise it will assume all features are numerical. If
+the data is a `pandas.DataFrame`, you can use
+:func:`~sklearn.compose.make_column_selector` to select features::
+
+    >>> import pandas as pd
+    >>> from sklearn.compose import make_column_selector as selector
+    >>> from sklearn.metrics.pairwise import gower_distances
+    >>> X = pd.DataFrame(
+    ...     {'city': ['London', 'London', 'Paris', 'Sallisaw'],
+    ...      'expert_rating': [5, 3, 4, 5],
+    ...      'user_rating': [4, 5, 4, 3]})
+    >>> gower_distances(X, categorical_features=selector(
+    ...     dtype_include=object))  # doctest: +ELLIPSIS
+    array([[0.        , 0.5       , 0.5       , 0.5       ],
+           [0.5       , 0.        , 0.666..., 1.        ],
+           [0.5       , 0.666..., 0.        , 0.666...],
+           [0.5       , 1.        , 0.666..., 0.        ]])
+
+.. topic:: References:
+
+    * Gower, J.C., 1971, A General Coefficient of Similarity and Some of Its 
+      Properties, Biometrics, Vol. 27, No. 4. (Dec., 1971), pp. 857-871.
+      http://members.cbio.mines-paristech.fr/~jvert/svn/bibli/local/Gower1971general.pdf
+
 .. _linear_kernel:
 
 Linear kernel

diff --git a/sklearn/metrics/pairwise.py b/sklearn/metrics/pairwise.py
@@ -21,16 +21,21 @@
 
 from ..utils.validation import _num_samples
 from ..utils.validation import check_non_negative
+from ..utils.validation import check_consistent_length
 from ..utils import check_array
 from ..utils import gen_even_slices
 from ..utils import gen_batches, get_chunk_n_rows
 from ..utils import is_scalar_nan
+from ..utils import _safe_indexing
+from ..utils import _get_column_indices
 from ..utils.extmath import row_norms, safe_sparse_dot
 from ..preprocessing import normalize
 from ..utils._mask import _get_mask
 
 from ._pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
 from ..exceptions import DataConversionWarning
+from ..utils.fixes import _object_dtype_isnan
+from ..preprocessing import MinMaxScaler
 
 
 # Utility Functions
@@ -544,7 +549,7 @@ def pairwise_distances_argmin_min(X, Y, axis=1, metric="euclidean",
         Valid values for metric are:
 
         - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
-          'manhattan']
+          'manhattan', 'gower']
 
         - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
           'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
@@ -632,7 +637,7 @@ def pairwise_distances_argmin(X, Y, axis=1, metric="euclidean",
         Valid values for metric are:
 
         - from scikit-learn: ['cityblock', 'cosine', 'euclidean', 'l1', 'l2',
-          'manhattan']
+          'manhattan', 'gower']
 
         - from scipy.spatial.distance: ['braycurtis', 'canberra', 'chebyshev',
           'correlation', 'dice', 'hamming', 'jaccard', 'kulsinski',
@@ -829,6 +834,172 @@ def cosine_distances(X, Y=None):
     return S
 
 
+def _split_categorical_numerical(X, categorical_features):
+    """Split X column-wise into its categorical and numerical parts.
+
+    The split is done before ``check_pairwise_arrays`` to avoid converting
+    the numerical data to object dtype: the data is first split into
+    categorical and numerical columns, and each part is validated afterwards
+    by the caller.
+
+    Parameters
+    ----------
+    X : {array-like, pandas.DataFrame} of shape (n_samples, n_features) \
+            or None
+        Data to split. Inputs without a ``shape`` attribute (e.g. lists) are
+        first converted with ``check_array`` using object dtype.
+
+    categorical_features : array-like of str, array-like of int, \
+            array-like of bool, slice, callable or None
+        Columns to put in the categorical part. A callable is called with
+        ``X`` and its return value is used as the column selection. ``None``
+        selects no column.
+
+    Returns
+    -------
+    X_cat : same container type as X, or None
+        The columns selected by ``categorical_features``; None if X is None.
+
+    X_num : same container type as X, or None
+        The remaining columns; None if X is None.
+    """
+    if X is None:
+        return None, None
+
+    # TODO: this should be more like check_array(..., accept_pandas=True)
+    if not hasattr(X, "shape"):
+        # The builtin ``object`` is used instead of the ``np.object`` alias,
+        # which was deprecated in NumPy 1.20 and removed in NumPy 1.24.
+        X = check_array(X, dtype=object, force_all_finite=False)
+
+    if callable(categorical_features):
+        cols = categorical_features(X)
+    else:
+        cols = categorical_features
+    if cols is None:
+        cols = []
+
+    col_idx = _get_column_indices(X, cols)
+    X_cat = _safe_indexing(X, col_idx, axis=1)
+    # NOTE(review): relies on _safe_indexing supporting ``complement=True``
+    # to return the columns that are not in ``col_idx`` -- confirm.
+    X_num = _safe_indexing(X, col_idx, axis=1, complement=True)
+
+    return X_cat, X_num
+
+
+def gower_distances(X, Y=None, categorical_features=None, scale=True,
+                    min_values=None, scale_factor=None):
+    """Compute the Gower distances between the observations in X and Y.
+
+    The observations may contain mixed types of data. Each numerical feature
+    contributes the absolute difference of its values, each categorical
+    feature contributes 1 if the values differ and 0 otherwise, and the sum
+    is divided by the number of features that are not missing (NaN) in both
+    observations.
+
+    Parameters
+    ----------
+    X : {array-like, pandas.DataFrame} of shape (n_samples_X, n_features)
+
+    Y : {array-like, pandas.DataFrame} of shape (n_samples_Y, n_features), \
+        default=None
+        If None, the distances between the rows of ``X`` are computed.
+
+    categorical_features : array-like of str, array-like of int, \
+            array-like of bool, slice or callable, default=None
+        Indexes the data on its second axis. Integers are interpreted as
+        positional columns, while strings can reference DataFrame columns
+        by name.
+        A callable is passed the input data `X` and can return any of the
+        above. To select multiple columns by name or dtype, you can use
+        :obj:`~sklearn.compose.make_column_selector`.
+
+        By default no column is considered categorical, i.e. all the
+        features are treated as numerical.
+
+    scale : bool, default=True
+        Indicates if the numerical columns will be scaled between 0 and 1.
+        If False, it is assumed the numerical columns are already scaled.
+        Unless ``min_values`` and ``scale_factor`` are provided, the scaling
+        parameters, i.e. min and max, are taken from both ``X`` and ``Y``.
+
+    min_values : ndarray of shape (n_numerical_features,), default=None
+        Per feature offset, set as the ``min_`` attribute of the
+        :class:`~sklearn.preprocessing.MinMaxScaler` applied to the
+        numerical columns. Only applied when ``scale=True``.
+        If provided, ``scale_factor`` should be provided as well.
+
+    scale_factor : ndarray of shape (n_numerical_features,), default=None
+        Per feature multiplier, set as the ``scale_`` attribute of the
+        :class:`~sklearn.preprocessing.MinMaxScaler` applied to the
+        numerical columns. Only applied when ``scale=True``.
+        If provided, ``min_values`` should be provided as well.
+
+    Returns
+    -------
+    distances : ndarray of shape (n_samples_X, n_samples_Y)
+        An entry is NaN when the two observations have no feature that is
+        observed in both of them.
+
+    Raises
+    ------
+    TypeError
+        If ``X`` or ``Y`` is a sparse matrix.
+
+    ValueError
+        If ``X`` is None or empty, or if ``scale=True`` and only one of
+        ``min_values`` and ``scale_factor`` is provided.
+
+    References
+    ----------
+    Gower, J.C., 1971, A General Coefficient of Similarity and Some of Its
+    Properties.
+
+    Notes
+    -----
+    Categorical ordinal attributes should be treated as numeric for the purpose
+    of Gower similarity.
+
+    Current implementation does not support sparse matrices.
+
+    Non-numerical columns (e.g., str) have to be listed in
+    ``categorical_features``; all the other columns are converted to float.
+
+    This implementation modifies the Gower's original similarity measure in
+    the following aspects:
+
+    * The values in the original similarity S range between 0 and 1. To
+      guarantee this, it is assumed the numerical features of X and Y are
+      scaled between 0 and 1.
+
+    * Different from the original similarity S, this implementation
+      returns 1-S.
+    """
+    def _nan_manhattan(x, y):
+        # nansum leaves out the features that are missing in x or in y
+        return np.nansum(np.abs(x - y))
+
+    def _n_observed(x, y):
+        # number of features that are missing neither in x nor in y
+        return np.sum(~_object_dtype_isnan(x) & ~_object_dtype_isnan(y))
+
+    def _nan_hamming(x, y):
+        # Assumes a missing value compares unequal to anything, so that
+        # ``x != y`` also counts the features with a missing value; those
+        # are subtracted to keep only mismatches between observed values.
+        return np.sum(x != y) - np.sum(
+            _object_dtype_isnan(x) | _object_dtype_isnan(y))
+
+    if issparse(X) or issparse(Y):
+        raise TypeError("Gower distance does not support sparse matrices")
+
+    if X is None or len(X) == 0:
+        raise ValueError("X can not be None or empty")
+
+    if scale and (scale_factor is None) != (min_values is None):
+        raise ValueError("min_values and scale_factor should be provided "
+                         "together.")
+
+    X_cat, X_num = _split_categorical_numerical(X, categorical_features)
+    Y_cat, Y_num = _split_categorical_numerical(Y, categorical_features)
+
+    if min_values is not None:
+        min_values = np.asarray(min_values)
+        scale_factor = np.asarray(scale_factor)
+        # The zero-width array is only there so that both lengths are also
+        # compared with the number of numerical columns.
+        check_consistent_length(min_values, scale_factor,
+                                np.ndarray(shape=(X_num.shape[1], 0)))
+
+    if X_num.shape[1]:
+        X_num, Y_num = check_pairwise_arrays(X_num, Y_num, precomputed=False,
+                                             dtype=float,
+                                             force_all_finite=False)
+        if scale:
+            same_data = Y_num is X_num
+            if scale_factor is None:
+                # the stacked copy is only needed to fit the scaler
+                scale_data = (X_num if same_data
+                              else np.vstack((X_num, Y_num)))
+                trs = MinMaxScaler().fit(scale_data)
+            else:
+                trs = MinMaxScaler()
+                trs.scale_ = scale_factor
+                trs.min_ = min_values
+            X_num = trs.transform(X_num)
+            # do not transform the same array a second time
+            Y_num = X_num if same_data else trs.transform(Y_num)
+
+        nan_manhattan = distance.cdist(X_num, Y_num, _nan_manhattan)
+        valid_num = distance.cdist(X_num, Y_num, _n_observed)
+    else:
+        nan_manhattan = valid_num = None
+
+    if X_cat.shape[1]:
+        # The builtin ``object`` is used instead of the ``np.object`` alias,
+        # which was deprecated in NumPy 1.20 and removed in NumPy 1.24.
+        X_cat, Y_cat = check_pairwise_arrays(X_cat, Y_cat, precomputed=False,
+                                             dtype=object,
+                                             force_all_finite=False)
+        nan_hamming = distance.cdist(X_cat, Y_cat, _nan_hamming)
+        valid_cat = distance.cdist(X_cat, Y_cat, _n_observed)
+    else:
+        nan_hamming = valid_cat = None
+
+    # based on whether there are categorical and/or numerical data present,
+    # we compute the distance metric
+    # Division by zero and nans warnings are ignored since they are expected
+    with np.errstate(divide='ignore', invalid='ignore'):
+        if valid_num is not None and valid_cat is not None:
+            D = (nan_manhattan + nan_hamming) / (valid_num + valid_cat)
+        elif valid_num is not None:
+            D = nan_manhattan / valid_num
+        else:
+            D = nan_hamming / valid_cat
+    return D
+
+
 # Paired distances
 def paired_euclidean_distances(X, Y):
     """
@@ -905,7 +1076,7 @@ def paired_cosine_distances(X, Y):
     'l2': paired_euclidean_distances,
     'l1': paired_manhattan_distances,
     'manhattan': paired_manhattan_distances,
-    'cityblock': paired_manhattan_distances}
+    'cityblock': paired_manhattan_distances, }
 
 
 def paired_distances(X, Y, metric="euclidean", **kwds):
@@ -1298,6 +1469,7 @@ def chi2_kernel(X, Y=None, gamma=1.):
     'l2': euclidean_distances,
     'l1': manhattan_distances,
     'manhattan': manhattan_distances,
+    'gower': gower_distances,
     'precomputed': None,  # HACK: precomputed is always allowed, never called
     'nan_euclidean': nan_euclidean_distances,
 }
@@ -1322,6 +1494,7 @@ def distance_metrics():
     'l1'            metrics.pairwise.manhattan_distances
     'l2'            metrics.pairwise.euclidean_distances
     'manhattan'     metrics.pairwise.manhattan_distances
+    'gower'          metrics.pairwise.gower_distances
     'nan_euclidean' metrics.pairwise.nan_euclidean_distances
     =============== ========================================
 
@@ -1400,7 +1573,7 @@ def _pairwise_callable(X, Y, metric, force_all_finite=True, **kwds):
                   'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto',
                   'russellrao', 'seuclidean', 'sokalmichener',
                   'sokalsneath', 'sqeuclidean', 'yule', "wminkowski",
-                  'nan_euclidean', 'haversine']
+                  'nan_euclidean', 'haversine', 'gower']
 
 _NAN_METRICS = ['nan_euclidean']
 
@@ -1429,6 +1602,28 @@ def _check_chunk_size(reduced, chunk_size):
 def _precompute_metric_params(X, Y, metric=None, **kwds):
     """Precompute data-derived metric parameters if not provided
     """
+    if metric == 'gower':
+        categorical_features = kwds.get('categorical_features', None)
+
+        _, X_num = _split_categorical_numerical(X, categorical_features)
+        _, Y_num = _split_categorical_numerical(Y, categorical_features)
+
+        scale = kwds.get('scale', True)
+        if not scale:
+            return {'min_values': None, 'scale_factor': None, 'scale': False}
+
+        scale_factor = kwds.get('scale_factor', None)
+        min_values = kwds.get('min_values', None)
+        if min_values is None:
+            data = X_num if Y is X or Y is None else np.vstack((X_num, Y_num))
+            trs = MinMaxScaler().fit(data)
+            min_values = trs.min_
+            scale_factor = trs.scale_
+
+        return {'min_values': min_values,
+                'scale_factor': scale_factor,
+                'scale': True}
+
     if metric == "seuclidean" and 'V' not in kwds:
         if X is Y:
             V = np.var(X, axis=0, ddof=1)
@@ -1721,6 +1916,17 @@ def pairwise_distances(X, Y=None, metric="euclidean", n_jobs=None,
         check_non_negative(X, whom=whom)
         return X
     elif metric in PAIRWISE_DISTANCE_FUNCTIONS:
+        if metric == 'gower':
+            """
+            # These conversions are necessary for matrices with string values
+            if not isinstance(X, np.ndarray):
+                X = np.asarray(X, dtype=np.object)
+            if Y is not None and not isinstance(Y, np.ndarray):
+                Y = np.asarray(Y, dtype=np.object)
+            """
+            params = _precompute_metric_params(X, Y, metric=metric, **kwds)
+            kwds.update(**params)
+
         func = PAIRWISE_DISTANCE_FUNCTIONS[metric]
     elif callable(metric):
         func = partial(_pairwise_callable, metric=metric,