Skip to content

Commit

Permalink
Ranking Metrics - Better Precision/Recall/MRR calculation (#1492)
Browse files Browse the repository at this point in the history
## Description

This PR:

- Corrects the metrics calculation for boolean and numeric scenarios
(NDCG and AP are unchanged, since they were already correct)
- Changes NDCG from a non-mergeable batch metric to a mergeable row-wise
metric
- Corrects `top_rank` to return None instead of 0 when there is no
relevant item (since lower is better for `top_rank`, reporting 0 would
wrongly look like a better-than-best rank)
- Changes MRR from a non-mergeable batch metric to a mergeable row-wise
metric (and renames it from `Mean Reciprocal Rank` to `Reciprocal Rank`)
- Refactors the code to group the row-wise statistics calculation functions

After these changes, the only metric that makes the ranking metrics
non-mergeable is `accuracy_k`

- [x] I have reviewed the [Guidelines for Contributing](CONTRIBUTING.md)
and the [Code of Conduct](CODE_OF_CONDUCT.md).

---------

Co-authored-by: felipe207 <felipe@whylabs.ai>
  • Loading branch information
FelipeAdachi and felipe207 committed Mar 26, 2024
1 parent db15e49 commit 538eb05
Show file tree
Hide file tree
Showing 3 changed files with 137 additions and 94 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -206,19 +206,14 @@
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean_reciprocal_rank</th>\n",
" <td>1</td>\n",
" <td>0.333333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>norm_dis_cumul_gain_k_3</th>\n",
" <td>1</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>precision_k_3</th>\n",
" <td>1</td>\n",
" <td>0.333333</td>\n",
" <td>0.666667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>predictions</th>\n",
Expand All @@ -231,14 +226,19 @@
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>reciprocal_rank</th>\n",
" <td>1</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>targets</th>\n",
" <td>1</td>\n",
" <td>0.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top_rank</th>\n",
" <td>1</td>\n",
" <td>3.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
Expand All @@ -249,13 +249,13 @@
"column \n",
"accuracy_k_3 1 1.000000\n",
"average_precision_k_3 1 1.000000\n",
"mean_reciprocal_rank 1 0.333333\n",
"norm_dis_cumul_gain_k_3 1 1.000000\n",
"precision_k_3 1 0.333333\n",
"precision_k_3 1 0.666667\n",
"predictions 1 0.000000\n",
"recall_k_3 1 1.000000\n",
"reciprocal_rank 1 1.000000\n",
"targets 1 0.000000\n",
"top_rank 1 3.000000"
"top_rank 1 1.000000"
]
},
"execution_count": 4,
Expand Down
45 changes: 32 additions & 13 deletions python/tests/experimental/api/test_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def test_log_batch_ranking_metrics_single_simple():

column_names = [
"accuracy_k_3",
"mean_reciprocal_rank",
"reciprocal_rank",
"precision_k_3",
"recall_k_3",
"top_rank",
Expand All @@ -33,17 +33,22 @@ def test_log_batch_ranking_metrics_single_simple():
for col in column_names:
assert col in pandas_summary.index
assert pandas_summary.loc["accuracy_k_3", "counts/n"] == 1
assert pandas_summary.loc["mean_reciprocal_rank", "counts/n"] == 1
assert pandas_summary.loc["reciprocal_rank", "counts/n"] == 4
assert pandas_summary.loc["precision_k_3", "counts/n"] == 4
assert pandas_summary.loc["recall_k_3", "counts/n"] == 4
assert pandas_summary.loc["top_rank", "counts/n"] == 4
assert pandas_summary.loc["average_precision_k_3", "counts/n"] == 4
assert pandas_summary.loc["norm_dis_cumul_gain_k_3", "counts/n"] == 1
assert pandas_summary.loc["average_precision_k_3", "counts/n"] == 4
assert pandas_summary.loc["norm_dis_cumul_gain_k_3", "counts/n"] == 1
assert pandas_summary.loc["norm_dis_cumul_gain_k_3", "counts/n"] == 4
# ndcg = [1, 0, 0.63, 0.5]
assert isclose(pandas_summary.loc["norm_dis_cumul_gain_k_3", "distribution/mean"], 0.53273, abs_tol=0.00001)
assert isclose(pandas_summary.loc["average_precision_k_3", "distribution/mean"], 0.45833, abs_tol=0.00001)
assert isclose(pandas_summary.loc["precision_k_3", "distribution/mean"], 0.25, abs_tol=0.00001)
assert isclose(pandas_summary.loc["recall_k_3", "distribution/mean"], 1.0, abs_tol=0.00001)
# rr = [1, 0, 0.5, 0.33333]
assert isclose(pandas_summary.loc["reciprocal_rank", "distribution/mean"], 0.45833, abs_tol=0.00001)
assert isclose(pandas_summary.loc["accuracy_k_3", "distribution/mean"], 0.75, abs_tol=0.00001)
assert isclose(pandas_summary.loc["sum_gain_k_3", "distribution/mean"], 0.75, abs_tol=0.00001)


def test_log_batch_ranking_metrics_binary_simple():
Expand All @@ -57,7 +62,7 @@ def test_log_batch_ranking_metrics_binary_simple():
k = 2
column_names = [
"accuracy_k_" + str(k),
"mean_reciprocal_rank",
"reciprocal_rank",
"precision_k_" + str(k),
"recall_k_" + str(k),
"top_rank",
Expand All @@ -67,16 +72,22 @@ def test_log_batch_ranking_metrics_binary_simple():
for col in column_names:
assert col in pandas_summary.index
assert pandas_summary.loc["accuracy_k_" + str(k), "counts/n"] == 1
assert pandas_summary.loc["mean_reciprocal_rank", "counts/n"] == 1
assert pandas_summary.loc["reciprocal_rank", "counts/n"] == 4
assert pandas_summary.loc["precision_k_" + str(k), "counts/n"] == 4
assert pandas_summary.loc["recall_k_" + str(k), "counts/n"] == 4
assert pandas_summary.loc["top_rank", "counts/n"] == 4
assert pandas_summary.loc["average_precision_k_" + str(k), "counts/n"] == 4
assert pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "counts/n"] == 1
assert pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "counts/n"] == 4
# ndcg@2 = [0.613147, 1.0, 1.0, 0.63093]
# average_precision_k_2 = [1.0, 0.0, 1.0, 0.5]
assert isclose(pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "distribution/mean"], 0.81101, abs_tol=0.00001)
assert isclose(pandas_summary.loc["average_precision_k_" + str(k), "distribution/mean"], 0.62500, abs_tol=0.00001)
assert isclose(pandas_summary.loc["precision_k_" + str(k), "distribution/mean"], 0.5, abs_tol=0.00001)
assert isclose(pandas_summary.loc["recall_k_" + str(k), "distribution/mean"], 0.83333, abs_tol=0.00001)
# rr = [1, 0, 1, 0.5]
assert isclose(pandas_summary.loc["reciprocal_rank", "distribution/mean"], 0.625, abs_tol=0.00001)
assert isclose(pandas_summary.loc["accuracy_k_2", "distribution/mean"], 0.75, abs_tol=0.00001)
assert isclose(pandas_summary.loc["sum_gain_k_2", "distribution/mean"], 1.0, abs_tol=0.00001)


def test_log_batch_ranking_metrics_multiple_simple():
Expand Down Expand Up @@ -104,7 +115,7 @@ def test_log_batch_ranking_metrics_multiple_simple():

column_names = [
"accuracy_k_" + str(k),
"mean_reciprocal_rank",
"reciprocal_rank",
"precision_k_" + str(k),
"recall_k_" + str(k),
"top_rank",
Expand All @@ -114,16 +125,17 @@ def test_log_batch_ranking_metrics_multiple_simple():
for col in column_names:
assert col in pandas_summary.index
assert pandas_summary.loc["accuracy_k_" + str(k), "counts/n"] == 1
assert pandas_summary.loc["mean_reciprocal_rank", "counts/n"] == 1
assert pandas_summary.loc["reciprocal_rank", "counts/n"] == 4
assert pandas_summary.loc["precision_k_" + str(k), "counts/n"] == 4
assert pandas_summary.loc["recall_k_" + str(k), "counts/n"] == 4
assert pandas_summary.loc["top_rank", "counts/n"] == 4
assert pandas_summary.loc["average_precision_k_" + str(k), "counts/n"] == 4
assert pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "counts/n"] == 1
assert pandas_summary.loc["norm_dis_cumul_gain_k_" + str(k), "counts/n"] == 4
# ndcg@3 = [0.9197, 0.0, 1.0, 0.386853]
# average_precision_k_3 = [0.83, 0.0, 1.0, 0.5]
assert isclose(pandas_summary.loc[f"norm_dis_cumul_gain_k_{k}", "distribution/median"], 0.57664, abs_tol=0.00001)
assert isclose(pandas_summary.loc[f"norm_dis_cumul_gain_k_{k}", "distribution/mean"], 0.57664, abs_tol=0.00001)
assert isclose(pandas_summary.loc["average_precision_k_" + str(k), "distribution/mean"], 0.58333, abs_tol=0.00001)
assert isclose(pandas_summary.loc["sum_gain_k_" + str(k), "distribution/mean"], 1.25, abs_tol=0.00001)


def test_log_batch_ranking_metrics_default_target():
Expand All @@ -135,7 +147,7 @@ def test_log_batch_ranking_metrics_default_target():
k = 3
column_names = [
"accuracy_k_" + str(k),
"mean_reciprocal_rank",
"reciprocal_rank",
"precision_k_" + str(k),
"recall_k_" + str(k),
"top_rank",
Expand All @@ -145,7 +157,7 @@ def test_log_batch_ranking_metrics_default_target():
for col in column_names:
assert col in pandas_summary.index
assert pandas_summary.loc["accuracy_k_" + str(k), "counts/n"] == 1
assert pandas_summary.loc["mean_reciprocal_rank", "counts/n"] == 1
assert pandas_summary.loc["reciprocal_rank", "counts/n"] == 1
assert pandas_summary.loc["precision_k_" + str(k), "counts/n"] == 1
assert pandas_summary.loc["recall_k_" + str(k), "counts/n"] == 1
assert pandas_summary.loc["top_rank", "counts/n"] == 1
Expand All @@ -155,6 +167,8 @@ def test_log_batch_ranking_metrics_default_target():
assert isclose(pandas_summary.loc[f"norm_dis_cumul_gain_k_{k}", "distribution/median"], 0.90130, abs_tol=0.00001)
# AP assumes binary relevance - this case doesn't raise an error, just a warning, but the result is not meaningful
assert isclose(pandas_summary.loc["average_precision_k_" + str(k), "distribution/mean"], 1.00000, abs_tol=0.00001)
assert isclose(pandas_summary.loc["accuracy_k_3", "distribution/mean"], 1.0, abs_tol=0.00001)
assert isclose(pandas_summary.loc["sum_gain_k_3", "distribution/mean"], 8.0, abs_tol=0.00001)


def test_log_batch_ranking_metrics_ranking_ndcg_wikipedia():
Expand Down Expand Up @@ -195,6 +209,10 @@ def test_log_batch_ranking_metrics_average_precision_sklearn_example():
pandas_summary = result.view().to_pandas()

assert isclose(pandas_summary.loc["average_precision_k_" + str(k), "distribution/mean"], 0.83333, abs_tol=0.00001)
assert isclose(pandas_summary.loc["precision_k_" + str(k), "distribution/mean"], 0.5, abs_tol=0.00001)
assert isclose(pandas_summary.loc["recall_k_" + str(k), "distribution/mean"], 1.0, abs_tol=0.00001)
assert isclose(pandas_summary.loc["reciprocal_rank", "distribution/mean"], 1.0, abs_tol=0.00001)
assert isclose(pandas_summary.loc["sum_gain_k_" + str(k), "distribution/mean"], 2.0, abs_tol=0.00001)


def test_log_batch_ranking_metrics_average_precision():
Expand All @@ -215,3 +233,4 @@ def test_log_batch_ranking_metrics_average_precision():
assert isclose(
pandas_summary.loc["average_precision_k_" + str(k), "distribution/mean"], res[1], abs_tol=0.00001
)
assert isclose(pandas_summary.loc["reciprocal_rank", "distribution/mean"], 0.45833, abs_tol=0.00001)

0 comments on commit 538eb05

Please sign in to comment.