NicolasHug · gautamramk · Apr 7, 2018 · May 1, 2018 · May 1, 2018 · Jun 27, 2018
diff --git a/surprise/prediction_algorithms/algo_base.py b/surprise/prediction_algorithms/algo_base.py
@@ -269,7 +269,8 @@ def compute_similarities(self, verbose=False):
         construction_func = {'cosine': sims.cosine,
                              'msd': sims.msd,
                              'pearson': sims.pearson,
-                             'pearson_baseline': sims.pearson_baseline}
+                             'pearson_baseline': sims.pearson_baseline,
+                             'spearman': sims.spearman}
 
         if self.sim_options['user_based']:
             n_x, yr = self.trainset.n_users, self.trainset.ir

diff --git a/surprise/similarities.pyx b/surprise/similarities.pyx
@@ -24,6 +24,8 @@ import numpy as np
 from six.moves import range
 from six import iteritems
 
+from scipy.stats import rankdata
+
 
 def cosine(n_x, yr, min_support):
     """Compute the cosine similarity between all pairs of users (or items).
@@ -359,3 +361,93 @@ def pearson_baseline(n_x, yr, min_support, global_mean, x_biases, y_biases,
             sim[xj, xi] = sim[xi, xj]
 
     return sim
+
+
+def spearman(n_x, yr, min_support):
+    """Compute the Spearman rank correlation coefficient between all pairs of
+    users (or items).
+
+    Only **common** users (or items) are taken into account. The Spearman
+    correlation coefficient is the Pearson correlation coefficient computed on
+    ranks rather than on raw ratings, and is defined as:
+
+    .. math ::
+        \\text{spearman_sim}(u, v) = \\frac{ \\sum\\limits_{i \in I_{uv}}
+        (k_{ui} -  \mu_u) \cdot (k_{vi} - \mu_{v})} {\\sqrt{\\sum\\limits_{i
+        \in I_{uv}} (k_{ui} -  \mu_u)^2} \cdot \\sqrt{\\sum\\limits_{i \in
+        I_{uv}} (k_{vi} -  \mu_{v})^2} }
+
+    or
+
+    .. math ::
+        \\text{spearman_sim}(i, j) = \\frac{ \\sum\\limits_{u \in U_{ij}}
+        (k_{ui} -  \mu_i) \cdot (k_{uj} - \mu_{j})} {\\sqrt{\\sum\\limits_{u
+        \in U_{ij}} (k_{ui} -  \mu_i)^2} \cdot \\sqrt{\\sum\\limits_{u \in
+        U_{ij}} (k_{uj} -  \mu_{j})^2} }
+
+    depending on the ``user_based`` field of ``sim_options`` (see
+    :ref:`similarity_measures_configuration`).
+
+    Here :math:`k_{ui}` is the rank of rating :math:`r_{ui}` among the ratings
+    being compared (the common ones only, ties sharing their average rank) and
+    :math:`\\mu` is the mean of those ranks.
+
+    Note: if there are no common users or items, similarity will be 0 (and not
+    -1).
+
+    For details on Spearman coefficient, see `Wikipedia
+    <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`__.
+
+    Args:
+        n_x(int): Number of xs; the result is an ``n_x`` by ``n_x`` matrix.
+        yr(dict): Maps each y to a list of ``(x, rating)`` tuples.
+        min_support(int): Minimum number of common ys for a pair of xs; pairs
+            with fewer common ys get a similarity of 0.
+
+    Returns:
+        The symmetric similarity matrix, as a numpy array of doubles with
+        ones on the diagonal.
+    """
+
+    # the similarity matrix
+    cdef np.ndarray[np.double_t, ndim=2] sim
+
+    cdef int xi, xj, n
+    cdef double num, denum
+    cdef int min_sprt = min_support
+
+    sim = np.zeros((n_x, n_x), np.double)
+
+    # xr[x] maps every y that x has a rating for to that rating. Ratings are
+    # kept as they are (possibly floats): no integer truncation.
+    xr = [{} for _ in range(n_x)]
+    for y, y_ratings in iteritems(yr):
+        for xi, r in y_ratings:
+            xr[xi][y] = r
+
+    for xi in range(n_x):
+        sim[xi, xi] = 1
+        for xj in range(xi + 1, n_x):
+
+            common = [y for y in xr[xi] if y in xr[xj]]
+            n = len(common)
+
+            # sim is zero-initialized, so unsupported pairs are just skipped
+            if n == 0 or n < min_sprt:
+                continue
+
+            # rank each x's ratings over the common ys only
+            ranks_i = rankdata([xr[xi][y] for y in common])
+            ranks_j = rankdata([xr[xj][y] for y in common])
+
+            num = n * np.dot(ranks_i, ranks_j) - ranks_i.sum() * ranks_j.sum()
+            denum = np.sqrt(
+                (n * np.dot(ranks_i, ranks_i) - ranks_i.sum()**2) *
+                (n * np.dot(ranks_j, ranks_j) - ranks_j.sum()**2))
+
+            # constant ranks (all ratings tied) give a zero denominator
+            if denum != 0:
+                sim[xi, xj] = num / denum
+                sim[xj, xi] = sim[xi, xj]
+
+    return sim
diff --git a/tests/test_similarities.py b/tests/test_similarities.py
@@ -12,11 +12,11 @@
 
 n_x = 8
 yr_global = {
-    0: [(0, 3), (1, 3), (2, 3), (5, 1),                 (6, 1.5), (7, 3)],  # noqa
+    0: [(0, 3), (1, 3), (2, 3),                 (5, 1), (6, 1.5), (7, 3)],  # noqa
     1: [(0, 4), (1, 4), (2, 4),                                         ],  # noqa
     2: [                (2, 5), (3, 2), (4, 3)                          ],  # noqa
-    3: [(1, 1),         (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)],  # noqa
-    4: [(1, 5),         (2, 1),                 (5, 2), (6, 2.5), (7, 2.5)], # noqa
+    3: [        (1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)],  # noqa
+    4: [        (1, 5), (2, 1),                 (5, 2), (6, 2.5), (7, 2.5)], # noqa
 }
 
 
@@ -205,3 +205,60 @@ def test_pearson_baseline_sim():
         for j in range(i + 1, n_x):
             if i != 1 and j != 2:
                 assert sim[i, j] == 0
+
+def test_spearman_sim():
+    """Test for spearman similarity"""
+
+    yr = yr_global.copy()  # shallow copy: the rating lists are still shared
+
+    # shuffle every rating list, to ensure the order in which ratings are
+    # processed does not matter (it's important because it used to be error
+    # prone when we were using itertools.combinations)
+    for _, ratings in yr.items():
+        random.shuffle(ratings)
+
+    sim = sims.spearman(n_x, yr, min_support=1)
+    # check symmetry and bounds. -1 <= spearman coeff <= 1
+    for xi in range(n_x):
+        assert sim[xi, xi] == 1
+        for xj in range(n_x):
+            assert sim[xi, xj] == sim[xj, xi]
+            assert -1 <= sim[xi, xj] <= 1
+
+    # on common items, users 0, 1 and 2 have the same ratings
+    assert sim[0, 1] == 1
+    assert sim[0, 2] == 1
+
+    # for vectors with constant ratings, spearman sim is necessarily zero (as
+    # ranks are centered)
+    assert sim[3, 4] == 0
+    assert sim[2, 3] == 0
+    assert sim[2, 4] == 0
+
+    # pairs of users (0, 3) and (0, 4) have no common items
+    assert sim[0, 3] == 0
+    assert sim[0, 4] == 0
+
+    # users 5 and 6 rank their common items in the same order
+    assert sim[5, 6] == 1
+
+    # check for floating point support and computation correctness
+    mean6 = (1 + 2 + 3) / 3
+    var6 = (3 - mean6) ** 2 + (1 - mean6) ** 2 + (2 - mean6) ** 2
+    mean7 = (1 + 2 + 3) / 3
+    var7 = (1 - mean7) ** 2 + (3 - mean7) ** 2 + (2 - mean7) ** 2
+    num = sum([((3 - mean6) * (1 - mean7)),
+               ((1 - mean6) * (3 - mean7)),
+               ((2 - mean6) * (2 - mean7))
+               ])
+    assert sim[6, 7] == num / (var6 * var7) ** 0.5
+
+    # ensure min_support is taken into account. Only users 1 and 2 have at
+    # least 4 common ratings.
+    sim = sims.spearman(n_x, yr, min_support=4)
+    for i in range(n_x):
+        for j in range(i + 1, n_x):
+            if i != 1 and j != 2:
+                assert sim[i, j] == 0
+
+