
Commit

added ignored columns handling
Taqi Jaffri committed Mar 13, 2024
1 parent f15cb12 commit ecc9f14
Showing 3 changed files with 74 additions and 23 deletions.
12 changes: 11 additions & 1 deletion docugami_dfm_benchmarks/cli.py
@@ -44,6 +44,7 @@ def eval_by_column(
 def eval_by_csv(
     ground_truth_csv: Path,
     model_output_csv: Path,
+    key_column: Optional[str] = None,
     output_format: OutputFormat = OutputFormat.GITHUB_MARKDOWN,
 ) -> None:

@@ -54,10 +55,19 @@ def eval_by_csv(
     model_output_reader = csv.DictReader(model_output_file)
     model_output_data = [row for row in model_output_reader]

-    scores = score_by_separate_csvs(gt_data, model_output_data)
+    scores, ignored_columns_gt, ignored_columns_model = score_by_separate_csvs(
+        gt_data, model_output_data
+    )
     table = tabulate_scores(scores, output_format)
     typer.echo(table)

+    typer.echo(
+        f"Ignored columns in ground truth CSV (no match in model output): {ignored_columns_gt}"
+    )
+    typer.echo(
+        f"Ignored columns in model output CSV (no match in ground truth): {ignored_columns_model}"
+    )
+

 def _version_callback(value: bool) -> None:
     """
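For illustration, with the change above, a run where each CSV contains one column the other lacks would end its output with two lines of the form below (the column names here are hypothetical, and the scores table itself is omitted):

Ignored columns in ground truth CSV (no match in model output): {'Unique GT column'}
Ignored columns in model output CSV (no match in ground truth): {'Unique MO column'}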
54 changes: 37 additions & 17 deletions docugami_dfm_benchmarks/utils/scorer.py
@@ -1,4 +1,4 @@
-from typing import Any
+from typing import Any, Optional

 import numpy as np
 from tqdm import tqdm
@@ -106,8 +106,10 @@ def score_by_column(data: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:


 def score_by_separate_csvs(
-    ground_truth_data: list[dict[str, Any]], model_output_data: list[dict[str, Any]]
-) -> dict:
+    ground_truth_data: list[dict[str, Any]],
+    model_output_data: list[dict[str, Any]],
+    key_column: Optional[str] = None,
+) -> tuple[dict, set, set]:
     """
     Scores model output against ground truth data when provided in separate CSVs.
     Each CSV should have columns with identical names for comparison. This function
@@ -124,22 +126,40 @@ def score_by_separate_csvs(
     Returns:
         - A dictionary of scores for each common column.
     """
-    gt_columns = set(ground_truth_data[0].keys())
-    model_columns = set(model_output_data[0].keys())
-    common_columns = gt_columns.intersection(model_columns)
-    ignored_columns = (gt_columns.union(model_columns)) - common_columns
-
-    if ignored_columns:
-        print(
-            f"Warning: Ignoring columns without matches in both CSVs: {ignored_columns}"
-        )
+    # Create mappings for ground truth and model output columns from normalized to original names
+    gt_columns_normalized = {normalize(key): key for key in ground_truth_data[0].keys()}
+    model_columns_normalized = {
+        normalize(key): key for key in model_output_data[0].keys()
+    }
+
+    # Identify common columns based on normalized names and keep track of the original names for later use
+    common_columns_normalized = set(gt_columns_normalized.keys()).intersection(
+        model_columns_normalized.keys()
+    )
+
+    # Initialize scores dictionary
     scores = {}
-    for column in tqdm(common_columns):
-        gt_annotations = [row[column] for row in ground_truth_data]
-        model_outputs = [row[column] for row in model_output_data]
+
+    # Prepare sets to track ignored columns based on their original names
+    ignored_columns_gt = set(ground_truth_data[0].keys()) - set(
+        gt_columns_normalized[norm] for norm in common_columns_normalized
+    )
+    ignored_columns_model = set(model_output_data[0].keys()) - set(
+        model_columns_normalized[norm] for norm in common_columns_normalized
+    )
+
+    # Iterate over common columns using the normalized names to facilitate comparison
+    for norm_col in common_columns_normalized:
+        original_gt_col = gt_columns_normalized[norm_col]
+        original_model_col = model_columns_normalized[norm_col]
+
+        gt_annotations = [row[original_gt_col] for row in ground_truth_data]
+        model_outputs = [row[original_model_col] for row in model_output_data]

         column_scores = _compute_scores_for_column(gt_annotations, model_outputs)
         _finalize_scores(column_scores, len(ground_truth_data))
-        scores[column] = column_scores
+        scores[original_gt_col] = (
+            column_scores  # Use the ground truth's original column name
+        )

-    return scores
+    return scores, ignored_columns_gt, ignored_columns_model
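For reference, a minimal usage sketch of the updated scorer (assuming the package is importable as docugami_dfm_benchmarks and that each row dict comes from csv.DictReader); the rows mirror the updated test below:

from docugami_dfm_benchmarks.utils.scorer import score_by_separate_csvs

# One column on each side has no counterpart in the other CSV, so each should be
# reported as ignored rather than scored.
ground_truth_data = [
    {"Column1": "Test sentence.", "Column2": "Another test.", "Unique GT column": "xyz"},
]
model_output_data = [
    {"Column1": "Test sentence.", "Column2": "", "Unique MO column": "abc"},
]

scores, ignored_gt, ignored_model = score_by_separate_csvs(ground_truth_data, model_output_data)
print(sorted(scores))    # expected: ['Column1', 'Column2'] (ground truth's original names)
print(ignored_gt)        # expected: {'Unique GT column'}
print(ignored_model)     # expected: {'Unique MO column'}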
31 changes: 26 additions & 5 deletions tests/utils/test_scorer.py
@@ -56,12 +56,28 @@ def test_score_by_column() -> None:

 def test_score_by_separate_csvs() -> None:
     ground_truth_data = [
-        {"Column1": "Test sentence.", "Column2": "Another test."},
-        {"Column1": "Second sentence.", "Column2": "Yet another test."},
+        {
+            "Column1": "Test sentence.",
+            "Column2": "Another test.",
+            "Unique GT column": "xyz",
+        },
+        {
+            "Column1": "Second sentence.",
+            "Column2": "Yet another test.",
+            "Unique GT column": "xyz",
+        },
     ]
     model_output_data = [
-        {"Column1": "Test sentence.", "Column2": ""},
-        {"Column1": "A different second sentence.", "Column2": "Yet another test."},
+        {
+            "Column1": "Test sentence.",
+            "Column2": "",
+            "Unique MO column": "abc",
+        },
+        {
+            "Column1": "A different second sentence.",
+            "Column2": "Yet another test.",
+            "Unique MO column": "abc",
+        },
     ]
     expected_scores = {
         "Column1": {
@@ -79,7 +95,12 @@ def test_score_by_separate_csvs() -> None:
f"{SIM_TITLE}0.6": 0.5,
},
}
scores = score_by_separate_csvs(ground_truth_data, model_output_data)
scores, ignored_columns_gt, ignored_columns_model = score_by_separate_csvs(
ground_truth_data, model_output_data
)

assert ignored_columns_gt == {"Unique GT column"}
assert ignored_columns_model == {"Unique MO column"}

for column in expected_scores:
for metric in expected_scores[column]:
