More tests and linting
Taqi Jaffri committed Mar 13, 2024
1 parent 8c431ad commit a2c82a8
Showing 3 changed files with 63 additions and 11 deletions.
2 changes: 1 addition & 1 deletion .vscode/settings.json
@@ -3,7 +3,7 @@
"tests",
"--doctest-modules",
"tests",
"docugami"
"docugami_dfm_benchmarks"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
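For reference, a rough command-line equivalent of the updated pytestArgs, expressed through pytest's Python entry point. This is a sketch only: it assumes pytest is installed in the project environment, and the full argument list in settings.json may contain paths beyond those visible in this hunk.

import pytest

# Collect tests from the "tests" directory and doctests from the renamed
# docugami_dfm_benchmarks package, mirroring the updated VS Code setting.
pytest.main(["tests", "--doctest-modules", "docugami_dfm_benchmarks"])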
22 changes: 12 additions & 10 deletions docugami_dfm_benchmarks/utils/scorer.py
@@ -13,7 +13,7 @@
KEY_GT = "Ground Truth"


-def _finalize_scores(scores, total_rows):
+def _finalize_scores(scores: dict[str, Any], total_rows: int) -> None:
"""
Normalizes scores by the total number of rows and calculates the average F1 score.
@@ -25,15 +25,17 @@ def _finalize_scores(scores, total_rows):
"""
avg_f1 = 0
for metric in list(scores):
-        if metric != "f1_per_row":
-            scores[metric] /= total_rows
-        else:
+        if metric == "f1_per_row":
            avg_f1 = np.mean(scores[metric]) * 100
+        else:
+            scores[metric] /= total_rows

scores["avg_f1"] = avg_f1


-def _compute_scores_for_column(gt_annotations, model_outputs):
+def _compute_scores_for_column(
+    gt_annotations: list[str], model_outputs: list[str]
+) -> dict[str, Any]:
"""
Computes the scores for a single column given lists of ground truth annotations and model outputs.
"""
@@ -49,19 +51,19 @@ def _compute_scores_for_column(gt_annotations, model_outputs):
gt_annotation = normalize(gt_annotation)
model_output = normalize(model_output)

scores["f1_per_row"].append(compute_f1(gt_annotation, model_output))
scores["f1_per_row"].append(compute_f1(gt_annotation, model_output)) # type: ignore

if gt_annotation == model_output:
scores["exact_match"] += 1
scores["exact_match"] += 1 # type: ignore
elif not model_output and gt_annotation:
scores["no_output"] += 1
scores["no_output"] += 1 # type: ignore

if gt_annotation and model_output:
similarity = semantic_similarity(gt_annotation, model_output)
if similarity >= 0.8:
scores[f"{SIM_TITLE}0.8"] += 1
scores[f"{SIM_TITLE}0.8"] += 1 # type: ignore
if similarity >= 0.6:
scores[f"{SIM_TITLE}0.6"] += 1
scores[f"{SIM_TITLE}0.6"] += 1 # type: ignore

return scores

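As a quick illustration of the newly annotated helper, here is a minimal usage sketch of _finalize_scores, assuming the docugami_dfm_benchmarks package is importable; the input dict mirrors the per-column counters that _compute_scores_for_column accumulates.

import numpy as np

from docugami_dfm_benchmarks.utils.scorer import _finalize_scores

# Raw counters for two rows: one exact match, per-row F1 scores of 1.0 and 0.5.
scores = {"exact_match": 1, "no_output": 0, "f1_per_row": np.array([1.0, 0.5])}

_finalize_scores(scores, total_rows=2)

print(scores["exact_match"])  # counts are normalized to fractions: 0.5
print(scores["avg_f1"])       # mean per-row F1, scaled to a percentage: 75.0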
50 changes: 50 additions & 0 deletions tests/utils/test_scorer.py
@@ -0,0 +1,50 @@
import numpy as np

from docugami_dfm_benchmarks.utils.scorer import _finalize_scores, score_by_column
from docugami_dfm_benchmarks.utils.similarity import SIM_TITLE


def test_finalize_scores() -> None:
scores = {"exact_match": 2, "no_output": 1, "f1_per_row": np.array([1, 0.5, 0.75])}
total_rows = 3
_finalize_scores(scores, total_rows)
assert scores["exact_match"] == 2 / 3
assert scores["no_output"] == 1 / 3
assert scores["avg_f1"] == np.mean([100, 50, 75])


def test_score_by_column() -> None:
data = [
{
"Ground Truth": "Test sentence.",
"Model A": "Test sentence.",
"Model B": "test sentence",
},
{
"Ground Truth": "Another test.",
"Model A": "A different sentence.",
"Model B": "",
},
]
expected_scores = {
"Model A": {
"avg_f1": 50.0,
"exact_match": 0.5,
"no_output": 0,
f"{SIM_TITLE}0.8": 0.5,
f"{SIM_TITLE}0.6": 0.5,
},
"Model B": {
"avg_f1": 50.0,
"exact_match": 0.5,
"no_output": 0.5,
f"{SIM_TITLE}0.8": 0.5,
f"{SIM_TITLE}0.6": 0.5,
},
}
scores = score_by_column(data)
for column in expected_scores:
for metric in expected_scores[column]:
assert np.isclose(
scores[column][metric], expected_scores[column][metric], atol=0.01
)
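The same call pattern works outside the test. Below is a usage sketch, assuming score_by_column returns the per-column metric dict asserted above.

from docugami_dfm_benchmarks.utils.scorer import score_by_column

# Each row pairs the "Ground Truth" column with one column per model,
# matching the structure exercised in test_score_by_column.
data = [
    {"Ground Truth": "Test sentence.", "Model A": "Test sentence."},
    {"Ground Truth": "Another test.", "Model A": "A different sentence."},
]

scores = score_by_column(data)
print(scores["Model A"]["exact_match"])  # 0.5 for this data, per the test above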
