Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Spearman Correlation for similarities module. #168

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion surprise/prediction_algorithms/algo_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,8 @@ def compute_similarities(self, verbose=False):
# Lookup table from similarity-measure name to the matching function in
# ``sims``. Every entry but the last needs a trailing comma: without it the
# 'spearman' entry is a syntax error.
construction_func = {'cosine': sims.cosine,
                     'msd': sims.msd,
                     'pearson': sims.pearson,
                     'pearson_baseline': sims.pearson_baseline,
                     'spearman': sims.spearman}

if self.sim_options['user_based']:
n_x, yr = self.trainset.n_users, self.trainset.ir
Expand Down
102 changes: 102 additions & 0 deletions surprise/similarities.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ import numpy as np
from six.moves import range
from six import iteritems

from scipy.stats import rankdata


def cosine(n_x, yr, min_support):
"""Compute the cosine similarity between all pairs of users (or items).
Expand Down Expand Up @@ -359,3 +361,103 @@ def pearson_baseline(n_x, yr, min_support, global_mean, x_biases, y_biases,
sim[xj, xi] = sim[xi, xj]

return sim


def spearman(n_x, yr, min_support):
    """Compute the Spearman rank correlation coefficient between all pairs of
    users (or items).

    Only **common** users (or items) are taken into account. The Spearman
    correlation coefficient can be seen as a non parametric Pearson
    similarity computed on ranks instead of raw ratings, and is defined as:

    .. math ::
        \\text{spearman_sim}(u, v) = \\frac{ \\sum\\limits_{i \\in I_{uv}}
        (k_{ui} - \\mu_u) \\cdot (k_{vi} - \\mu_{v})} {\\sqrt{\\sum\\limits_{i
        \\in I_{uv}} (k_{ui} - \\mu_u)^2} \\cdot \\sqrt{\\sum\\limits_{i \\in
        I_{uv}} (k_{vi} - \\mu_{v})^2} }

    or

    .. math ::
        \\text{spearman_sim}(i, j) = \\frac{ \\sum\\limits_{u \\in U_{ij}}
        (k_{ui} - \\mu_i) \\cdot (k_{uj} - \\mu_{j})} {\\sqrt{\\sum\\limits_{u
        \\in U_{ij}} (k_{ui} - \\mu_i)^2} \\cdot \\sqrt{\\sum\\limits_{u \\in
        U_{ij}} (k_{uj} - \\mu_{j})^2} }

    depending on the ``user_based`` field of ``sim_options`` (see
    :ref:`similarity_measures_configuration`).

    Here :math:`k_{ui}` is the rank of the rating :math:`r_{ui}` among all
    the ratings of the same user (or item); tied ratings receive their
    average rank. The means :math:`\\mu` are means of ranks over the common
    items (or users).

    Note: if there are no common users or items, similarity will be 0 (and not
    -1). The similarity is also 0 when one of the two rank vectors is
    constant on the common items (or users).

    For details on Spearman coefficient, see `Wikipedia
    <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`__.

    Args:
        n_x(int): Number of xs; the returned matrix is ``n_x`` by ``n_x``.
        yr(dict): Maps each y to the list of ``(x, rating)`` tuples given to
            that y.
        min_support(int): Minimum number of common ys required for a pair of
            xs; below it, the similarity is 0.

    Returns:
        The symmetric ``n_x`` by ``n_x`` similarity matrix, with ones on the
        diagonal.
    """

    # number of common ys
    cdef np.ndarray[np.int_t, ndim=2] freq
    # sum (rank_xy * rank_x'y) for common ys
    cdef np.ndarray[np.double_t, ndim=2] prods
    # sum (rank_xy ^ 2) for common ys
    cdef np.ndarray[np.double_t, ndim=2] sqi
    # sum (rank_x'y ^ 2) for common ys
    cdef np.ndarray[np.double_t, ndim=2] sqj
    # sum (rank_xy) for common ys
    cdef np.ndarray[np.double_t, ndim=2] si
    # sum (rank_x'y) for common ys
    cdef np.ndarray[np.double_t, ndim=2] sj
    # the similarity matrix
    cdef np.ndarray[np.double_t, ndim=2] sim

    cdef int xi, xj
    # ranks are doubles: tied ratings get fractional (average) ranks
    cdef double ki, kj
    cdef int min_sprt = min_support

    freq = np.zeros((n_x, n_x), np.int_)
    prods = np.zeros((n_x, n_x), np.double)
    sqi = np.zeros((n_x, n_x), np.double)
    sqj = np.zeros((n_x, n_x), np.double)
    si = np.zeros((n_x, n_x), np.double)
    sj = np.zeros((n_x, n_x), np.double)
    sim = np.zeros((n_x, n_x), np.double)

    # Regroup the ratings by x: ranking has to be done over the ratings of a
    # single x, whereas yr is indexed by y.
    ys_of_x = {}
    ratings_of_x = {}
    for y, y_ratings in iteritems(yr):
        for x, r in y_ratings:
            ys_of_x.setdefault(x, []).append(y)
            ratings_of_x.setdefault(x, []).append(r)

    # Turn every rating into its rank and index the result by y again, so
    # that the accumulation below only visits xs that share a common y.
    y_ranks = {}
    for x, ys in iteritems(ys_of_x):
        x_ranks = rankdata(ratings_of_x[x])
        for y, k in zip(ys, x_ranks):
            y_ranks.setdefault(y, []).append((x, k))

    for y, ranks in iteritems(y_ranks):
        for xi, ki in ranks:
            for xj, kj in ranks:
                freq[xi, xj] += 1
                prods[xi, xj] += ki * kj
                sqi[xi, xj] += ki**2
                sqj[xi, xj] += kj**2
                si[xi, xj] += ki
                sj[xi, xj] += kj

    for xi in range(n_x):
        sim[xi, xi] = 1
        for xj in range(xi + 1, n_x):

            if freq[xi, xj] < min_sprt:
                sim[xi, xj] = 0
            else:
                n = freq[xi, xj]
                num = n * prods[xi, xj] - si[xi, xj] * sj[xi, xj]
                denum = np.sqrt((n * sqi[xi, xj] - si[xi, xj]**2) *
                                (n * sqj[xi, xj] - sj[xi, xj]**2))
                # a constant rank vector has zero variance: define sim as 0
                if denum == 0:
                    sim[xi, xj] = 0
                else:
                    sim[xi, xj] = num / denum

            sim[xj, xi] = sim[xi, xj]

    return sim
63 changes: 60 additions & 3 deletions tests/test_similarities.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@

# Number of xs appearing in the fixture below (x ids range from 0 to 7).
n_x = 8
# Shared ratings fixture: maps each y id to a list of (x id, rating) tuples.
# Entries are laid out one column per x id; each key is defined exactly once.
yr_global = {
    0: [(0, 3), (1, 3), (2, 3),                 (5, 1), (6, 1.5), (7, 3)],  # noqa
    1: [(0, 4), (1, 4), (2, 4),                                         ],  # noqa
    2: [                (2, 5), (3, 2), (4, 3)                          ],  # noqa
    3: [        (1, 1), (2, 4), (3, 2), (4, 3), (5, 3), (6, 3.5), (7, 2)],  # noqa
    4: [        (1, 5), (2, 1),                 (5, 2), (6, 2.5), (7, 2.5)],  # noqa
}


Expand Down Expand Up @@ -205,3 +205,60 @@ def test_pearson_baseline_sim():
for j in range(i + 1, n_x):
if i != 1 and j != 2:
assert sim[i, j] == 0

def test_spearman_sim():
    """Tests for the Spearman similarity."""

    # Copy the rating lists too (not only the dict), so that shuffling below
    # does not mutate the module-level fixture shared with the other tests.
    yr = {y: list(ratings) for y, ratings in yr_global.items()}

    # shuffle every rating list, to ensure the order in which ratings are
    # processed does not matter (it's important because it used to be error
    # prone when we were using itertools.combinations)
    for ratings in yr.values():
        random.shuffle(ratings)

    sim = sims.spearman(n_x, yr, min_support=1)
    # check symmetry and bounds. -1 <= Spearman coeff <= 1
    for xi in range(n_x):
        assert sim[xi, xi] == 1
        for xj in range(n_x):
            assert sim[xi, xj] == sim[xj, xi]
            assert -1 <= sim[xi, xj] <= 1

    # on common items, users 0, 1 and 2 have the same ratings
    assert sim[0, 1] == 1
    assert sim[0, 2] == 1

    # for vectors with constant ratings, Spearman sim is necessarily zero (as
    # ranks are centered)
    assert sim[3, 4] == 0
    assert sim[2, 3] == 0
    assert sim[2, 4] == 0

    # pairs of users (0, 3) and (0, 4) have no common items
    assert sim[0, 3] == 0
    assert sim[0, 4] == 0

    # ratings have same rankings
    assert sim[5, 6] == 1

    # check for float point support and computation correctness: users 6 and
    # 7 rated the same three items, with ranks (1, 3, 2) and (3, 1, 2)
    mean6 = (1 + 2 + 3) / 3
    var6 = (3 - mean6) ** 2 + (1 - mean6) ** 2 + (2 - mean6) ** 2
    mean7 = (1 + 2 + 3) / 3
    var7 = (1 - mean7) ** 2 + (3 - mean7) ** 2 + (2 - mean7) ** 2
    num = sum([((3 - mean6) * (1 - mean7)),
               ((1 - mean6) * (3 - mean7)),
               ((2 - mean6) * (2 - mean7))
               ])
    assert sim[6, 7] == num / (var6 * var7) ** 0.5

    # ensure min_support is taken into account. Only users 1 and 2 have at
    # least 4 common ratings, so every other pair must have a zero similarity.
    sim = sims.spearman(n_x, yr, min_support=4)
    for i in range(n_x):
        for j in range(i + 1, n_x):
            # compare the pair as a whole: `i != 1 and j != 2` would also
            # skip every pair that merely involves user 1 or user 2
            if (i, j) != (1, 2):
                assert sim[i, j] == 0