More tests and linting
Taqi Jaffri committed Mar 13, 2024
1 parent 8c431ad commit a2c82a8
Showing 3 changed files with 63 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .vscode/settings.json
@@ -3,7 +3,7 @@
"tests",
"--doctest-modules",
"tests",
"docugami"
"docugami_dfm_benchmarks"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
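For reference, a rough command-line equivalent of the updated pytestArgs, expressed through pytest's Python entry point. This is a sketch only: it assumes pytest is installed in the project environment, and the full argument list in settings.json may contain paths beyond those visible in this hunk.

import pytest

# Collect tests from the "tests" directory and doctests from the renamed
# docugami_dfm_benchmarks package, mirroring the updated VS Code setting.
pytest.main(["tests", "--doctest-modules", "docugami_dfm_benchmarks"])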
22 changes: 12 additions & 10 deletions docugami_dfm_benchmarks/utils/scorer.py
@@ -13,7 +13,7 @@
KEY_GT = "Ground Truth"


-def _finalize_scores(scores, total_rows):
+def _finalize_scores(scores: dict[str, Any], total_rows: int) -> None:
"""
Normalizes scores by the total number of rows and calculates the average F1 score.
@@ -25,15 +25,17 @@ def _finalize_scores(scores, total_rows):
"""
avg_f1 = 0
for metric in list(scores):
-        if metric != "f1_per_row":
-            scores[metric] /= total_rows
-        else:
+        if metric == "f1_per_row":
            avg_f1 = np.mean(scores[metric]) * 100
+        else:
+            scores[metric] /= total_rows

scores["avg_f1"] = avg_f1


-def _compute_scores_for_column(gt_annotations, model_outputs):
+def _compute_scores_for_column(
+    gt_annotations: list[str], model_outputs: list[str]
+) -> dict[str, Any]:
"""
Computes the scores for a single column given lists of ground truth annotations and model outputs.
"""
@@ -49,19 +51,19 @@ def _compute_scores_for_column(gt_annotations, model_outputs):
gt_annotation = normalize(gt_annotation)
model_output = normalize(model_output)

scores["f1_per_row"].append(compute_f1(gt_annotation, model_output))
scores["f1_per_row"].append(compute_f1(gt_annotation, model_output)) # type: ignore

if gt_annotation == model_output:
scores["exact_match"] += 1
scores["exact_match"] += 1 # type: ignore
elif not model_output and gt_annotation:
scores["no_output"] += 1
scores["no_output"] += 1 # type: ignore

if gt_annotation and model_output:
similarity = semantic_similarity(gt_annotation, model_output)
if similarity >= 0.8:
scores[f"{SIM_TITLE}0.8"] += 1
scores[f"{SIM_TITLE}0.8"] += 1 # type: ignore
if similarity >= 0.6:
scores[f"{SIM_TITLE}0.6"] += 1
scores[f"{SIM_TITLE}0.6"] += 1 # type: ignore

return scores

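As a quick illustration of the newly annotated helper, here is a minimal usage sketch of _finalize_scores, assuming the docugami_dfm_benchmarks package is importable; the input dict mirrors the per-column counters that _compute_scores_for_column accumulates.

import numpy as np

from docugami_dfm_benchmarks.utils.scorer import _finalize_scores

# Raw counters for two rows: one exact match, per-row F1 scores of 1.0 and 0.5.
scores = {"exact_match": 1, "no_output": 0, "f1_per_row": np.array([1.0, 0.5])}

_finalize_scores(scores, total_rows=2)

print(scores["exact_match"])  # counts are normalized to fractions: 0.5
print(scores["avg_f1"])       # mean per-row F1, scaled to a percentage: 75.0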
50 changes: 50 additions & 0 deletions tests/utils/test_scorer.py
@@ -0,0 +1,50 @@
import numpy as np

from docugami_dfm_benchmarks.utils.scorer import _finalize_scores, score_by_column
from docugami_dfm_benchmarks.utils.similarity import SIM_TITLE


def test_finalize_scores() -> None:
scores = {"exact_match": 2, "no_output": 1, "f1_per_row": np.array([1, 0.5, 0.75])}
total_rows = 3
_finalize_scores(scores, total_rows)
assert scores["exact_match"] == 2 / 3
assert scores["no_output"] == 1 / 3
assert scores["avg_f1"] == np.mean([100, 50, 75])


def test_score_by_column() -> None:
data = [
{
"Ground Truth": "Test sentence.",
"Model A": "Test sentence.",
"Model B": "test sentence",
},
{
"Ground Truth": "Another test.",
"Model A": "A different sentence.",
"Model B": "",
},
]
expected_scores = {
"Model A": {
"avg_f1": 50.0,
"exact_match": 0.5,
"no_output": 0,
f"{SIM_TITLE}0.8": 0.5,
f"{SIM_TITLE}0.6": 0.5,
},
"Model B": {
"avg_f1": 50.0,
"exact_match": 0.5,
"no_output": 0.5,
f"{SIM_TITLE}0.8": 0.5,
f"{SIM_TITLE}0.6": 0.5,
},
}
scores = score_by_column(data)
for column in expected_scores:
for metric in expected_scores[column]:
assert np.isclose(
scores[column][metric], expected_scores[column][metric], atol=0.01
)
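The same call pattern works outside the test. Below is a usage sketch, assuming score_by_column returns the per-column metric dict asserted above.

from docugami_dfm_benchmarks.utils.scorer import score_by_column

# Each row pairs the "Ground Truth" column with one column per model,
# matching the structure exercised in test_score_by_column.
data = [
    {"Ground Truth": "Test sentence.", "Model A": "Test sentence."},
    {"Ground Truth": "Another test.", "Model A": "A different sentence."},
]

scores = score_by_column(data)
print(scores["Model A"]["exact_match"])  # 0.5 for this data, per the test above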
