Key column support for alignment
Taqi Jaffri committed Mar 13, 2024
1 parent ecc9f14 commit 844b574
Showing 5 changed files with 201 additions and 54 deletions.
29 changes: 23 additions & 6 deletions docugami_dfm_benchmarks/cli.py
@@ -48,16 +48,20 @@ def eval_by_csv(
output_format: OutputFormat = OutputFormat.GITHUB_MARKDOWN,
) -> None:

with open(ground_truth_csv) as gt_file:
with open(ground_truth_csv, encoding="utf-8-sig") as gt_file:
gt_reader = csv.DictReader(gt_file)
gt_data = [row for row in gt_reader]
with open(model_output_csv) as model_output_file:
with open(model_output_csv, encoding="utf-8-sig") as model_output_file:
model_output_reader = csv.DictReader(model_output_file)
model_output_data = [row for row in model_output_reader]

scores, ignored_columns_gt, ignored_columns_model = score_by_separate_csvs(
gt_data, model_output_data
)
(
scores,
ignored_columns_gt,
ignored_columns_model,
unmatched_gt,
unmatched_mo,
) = score_by_separate_csvs(gt_data, model_output_data, key_column)
table = tabulate_scores(scores, output_format)
typer.echo(table)

@@ -68,6 +72,14 @@ def eval_by_csv(
f"Ignored columns in model output CSV (no match in ground truth): {ignored_columns_model}"
)

if key_column:
typer.echo(
f"{len(unmatched_gt)} rows in ground truth did not have matching rows in model output (based on key column {key_column})"
)
typer.echo(
f"{len(unmatched_mo)} rows in model output did not have matching rows in ground truth (based on key column {key_column})"
)


def _version_callback(value: bool) -> None:
"""
@@ -100,7 +112,12 @@ def main(
if __name__ == "__main__":
if sys.gettrace() is not None:
# debugger attached, modify call below and attach
eval_by_column(Path("./temp/CSL-Small.csv")) # nosec
# eval_by_column(Path("./temp/CSL-Small.csv")) # nosec
eval_by_csv(
Path("./temp/tangible_ground_truth.csv"),
Path("./temp/tangible_model_output.csv"),
key_column="COMPLAINT PDF FILE NAME",
)
else:
# proceed as normal
app()
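
For context on what the changed CLI path now does end to end: both CSVs are read with the utf-8-sig codec, so a byte-order mark written by Excel exports cannot leak into the first header name, and the optional key column is passed through to the scorer, which now also reports unmatched rows. A minimal standalone sketch of that flow, not part of the diff; the file names and the "ID" key column are placeholders, not values from this commit:

import csv

from docugami_dfm_benchmarks.utils.scorer import score_by_separate_csvs

# utf-8-sig strips a leading BOM if one is present, matching the change above.
with open("ground_truth.csv", encoding="utf-8-sig") as gt_file:
    gt_data = list(csv.DictReader(gt_file))
with open("model_output.csv", encoding="utf-8-sig") as mo_file:
    mo_data = list(csv.DictReader(mo_file))

# The scorer now always returns five values; the last two are empty without a key column.
scores, ignored_gt, ignored_mo, unmatched_gt, unmatched_mo = score_by_separate_csvs(
    gt_data, mo_data, key_column="ID"  # placeholder key column name
)
print(f"{len(unmatched_gt)} ground-truth rows and {len(unmatched_mo)} model rows had no match")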
125 changes: 82 additions & 43 deletions docugami_dfm_benchmarks/utils/scorer.py
@@ -33,39 +33,48 @@ def _finalize_scores(scores: dict[str, Any], total_rows: int) -> None:
scores["avg_f1"] = avg_f1


def _compute_scores_for_column(
gt_annotations: list[str], model_outputs: list[str]
) -> dict[str, Any]:
"""
Computes the scores for a single column given lists of ground truth annotations and model outputs.
"""
scores = {
def _initialize_score_structure() -> dict:
"""Initializes the structure for storing scores."""
return {
f"{SIM_TITLE}0.8": 0,
f"{SIM_TITLE}0.6": 0,
"exact_match": 0,
"no_output": 0,
"f1_per_row": [],
}

for gt_annotation, model_output in zip(gt_annotations, model_outputs):
gt_annotation = normalize(gt_annotation)
model_output = normalize(model_output)

scores["f1_per_row"].append(compute_f1(gt_annotation, model_output)) # type: ignore
def _update_scores(score_struct: dict, gt_annotation: str, model_output: str) -> None:
"""
Updates the score structure based on a single row's GT and model output, including semantic similarity.
"""
# Normalize the inputs (callers may already have normalized; normalizing again is harmless)
gt_annotation = normalize(gt_annotation)
model_output = normalize(model_output)

if gt_annotation == model_output:
scores["exact_match"] += 1 # type: ignore
elif not model_output and gt_annotation:
scores["no_output"] += 1 # type: ignore
# Compute F1 score and update
score_struct["f1_per_row"].append(compute_f1(gt_annotation, model_output))

if gt_annotation and model_output:
similarity = semantic_similarity(gt_annotation, model_output)
if similarity >= 0.8:
scores[f"{SIM_TITLE}0.8"] += 1 # type: ignore
if similarity >= 0.6:
scores[f"{SIM_TITLE}0.6"] += 1 # type: ignore
# Check for exact matches
if gt_annotation == model_output:
score_struct["exact_match"] += 1
elif not model_output and gt_annotation:
# Consider cases where the model output is empty but there is a GT annotation
score_struct["no_output"] += 1

return scores
# Calculate semantic similarity if both GT and model outputs are non-empty
if gt_annotation and model_output:
similarity = semantic_similarity(gt_annotation, model_output)
if similarity >= 0.8:
score_struct[f"{SIM_TITLE}0.8"] += 1
if similarity >= 0.6:
score_struct[f"{SIM_TITLE}0.6"] += 1


def _finalize_all_scores(scores: dict, total_matches: int) -> None:
"""Finalizes all score structures within the scores dict."""
for score_struct in scores.values():
_finalize_scores(score_struct, total_matches)


def score_by_column(data: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
@@ -98,8 +107,17 @@ def score_by_column(data: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
for column in tqdm(model_columns):
gt_annotations = [normalize(row[KEY_GT]) for row in data]
model_outputs = [normalize(row[column]) for row in data]
column_scores = _compute_scores_for_column(gt_annotations, model_outputs)

# Initialize the score structure for this column
column_scores = _initialize_score_structure()

# Update scores for each row
for gt_annotation, model_output in zip(gt_annotations, model_outputs):
_update_scores(column_scores, gt_annotation, model_output)

# Finalize scores by calculating average F1 and normalizing metrics
_finalize_scores(column_scores, len(data))

scores[column] = column_scores

return scores
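
The loop above is the pattern this refactor enables: the old _compute_scores_for_column body is split into _initialize_score_structure, a per-row _update_scores, and the pre-existing _finalize_scores. A minimal illustrative sketch of that composition on toy data (not part of the diff):

from docugami_dfm_benchmarks.utils.scorer import (
    _finalize_scores,
    _initialize_score_structure,
    _update_scores,
)

gt = ["Test sentence.", "Another test."]
model = ["Test sentence.", ""]  # second row: no model output

column_scores = _initialize_score_structure()
for gt_annotation, model_output in zip(gt, model):
    _update_scores(column_scores, gt_annotation, model_output)
_finalize_scores(column_scores, len(gt))  # averages F1 and normalizes the counters

print(column_scores["exact_match"], column_scores["no_output"], column_scores["avg_f1"])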
@@ -109,7 +127,7 @@ def score_by_separate_csvs(
ground_truth_data: list[dict[str, Any]],
model_output_data: list[dict[str, Any]],
key_column: Optional[str] = None,
) -> tuple[dict, set, set]:
) -> tuple[dict, list[str], list[str], list[str], list[str]]:
"""
Scores model output against ground truth data when provided in separate CSVs.
Each CSV should have columns with identical names for comparison. This function
@@ -126,7 +144,6 @@ def score_by_separate_csvs(
Returns:
- A dict of scores for each column common to both CSVs.
- Sorted lists of ignored columns in the ground truth and model output CSVs (no match in the other file).
- Sorted lists of key values found only in the ground truth or only in the model output (empty unless a key column is given).
"""
# Create mappings for ground truth and model output columns from normalized to original names
gt_columns_normalized = {normalize(key): key for key in ground_truth_data[0].keys()}
model_columns_normalized = {
normalize(key): key for key in model_output_data[0].keys()
@@ -137,29 +154,51 @@ def score_by_separate_csvs(
model_columns_normalized.keys()
)

# Initialize scores dictionary
scores = {}

# Prepare sets to track ignored columns based on their original names
ignored_columns_gt = set(ground_truth_data[0].keys()) - set(
gt_columns_normalized[norm] for norm in common_columns_normalized
)
ignored_columns_model = set(model_output_data[0].keys()) - set(
model_columns_normalized[norm] for norm in common_columns_normalized
)

# Iterate over common columns using the normalized names to facilitate comparison
for norm_col in common_columns_normalized:
original_gt_col = gt_columns_normalized[norm_col]
original_model_col = model_columns_normalized[norm_col]

gt_annotations = [row[original_gt_col] for row in ground_truth_data]
model_outputs = [row[original_model_col] for row in model_output_data]

column_scores = _compute_scores_for_column(gt_annotations, model_outputs)
_finalize_scores(column_scores, len(ground_truth_data))
scores[original_gt_col] = (
column_scores # Use the ground truth's original column name
)

return scores, ignored_columns_gt, ignored_columns_model
if key_column:
gt_keyed_data = {row[key_column]: row for row in ground_truth_data}
mo_keyed_data = {row[key_column]: row for row in model_output_data}
matched_rows_set = set(gt_keyed_data.keys()).intersection(mo_keyed_data.keys())
unmatched_gt = set(gt_keyed_data.keys()) - matched_rows_set
unmatched_mo = set(mo_keyed_data.keys()) - matched_rows_set
matched_rows = list(
matched_rows_set
) # Ensure this is always a list for consistency
else:
matched_rows = list(range(len(ground_truth_data))) # type: ignore
unmatched_gt = unmatched_mo = set()

for match in matched_rows:
if key_column:
gt_row = gt_keyed_data[match]
mo_row = mo_keyed_data[match]
else:
gt_row = ground_truth_data[int(match)]
mo_row = model_output_data[int(match)]

for norm_col in common_columns_normalized:
original_gt_col = gt_columns_normalized[norm_col]
original_model_col = model_columns_normalized[norm_col]
if original_gt_col in gt_row and original_model_col in mo_row:
gt_annotation = gt_row[original_gt_col]
model_output = mo_row[original_model_col]
if original_gt_col not in scores:
scores[original_gt_col] = _initialize_score_structure()
_update_scores(scores[original_gt_col], gt_annotation, model_output)

_finalize_all_scores(scores, len(matched_rows))

return (
scores,
sorted(ignored_columns_gt),
sorted(ignored_columns_model),
sorted(unmatched_gt),
sorted(unmatched_mo),
)
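
To make the new alignment behaviour concrete, here is a small illustrative call (not part of the diff) showing how rows are paired on the key column and how the extra return values surface rows that could not be matched. The column names and values are invented for the example:

from docugami_dfm_benchmarks.utils.scorer import score_by_separate_csvs

gt_rows = [
    {"ID": "1", "Summary": "Payment was late."},
    {"ID": "2", "Summary": "Contract renewed."},
]
model_rows = [
    {"ID": "2", "Summary": "The contract was renewed."},
    {"ID": "3", "Summary": "Row with no ground truth."},
]

scores, ignored_gt, ignored_mo, unmatched_gt, unmatched_mo = score_by_separate_csvs(
    gt_rows, model_rows, key_column="ID"
)

# Only the rows sharing ID "2" are scored; the others are reported back.
assert unmatched_gt == ["1"]
assert unmatched_mo == ["3"]
assert not ignored_gt and not ignored_mo  # both CSVs have the same columns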
2 changes: 2 additions & 0 deletions docugami_dfm_benchmarks/utils/text.py
@@ -13,6 +13,7 @@ def white_space_fix(text: str) -> str:
return " ".join(text.split())

def remove_punc(text: str) -> str:
text = text.replace("_", " ")
exclude = set(string.punctuation)
return "".join(ch for ch in text if ch not in exclude)

@@ -21,6 +22,7 @@ def lower(text: str) -> str:

return white_space_fix(remove_articles(remove_punc(lower(text))))


def get_tokens(s: str) -> list[str]:
"""Gets normalized tokens from the given string."""
if not s:
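
The one-line change above (mapping underscores to spaces before punctuation is stripped) is what lets snake_case values such as key-column headers compare cleanly: without it, remove_punc would fuse the words into a single token. A quick illustrative sketch, mirroring the new test further below:

from docugami_dfm_benchmarks.utils.text import normalize

# Underscores now become spaces, so the words stay separate tokens.
assert normalize("AMENDMENT_NUMBER") == "amendment number"
# Previously the underscore was stripped as punctuation, yielding "amendmentnumber".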
94 changes: 89 additions & 5 deletions tests/utils/test_scorer.py
@@ -54,7 +54,7 @@ def test_score_by_column() -> None:
)


def test_score_by_separate_csvs() -> None:
def test_score_by_separate_csvs_aligned() -> None:
ground_truth_data = [
{
"Column1": "Test sentence.",
@@ -95,13 +95,97 @@ def test_score_by_separate_csvs() -> None:
f"{SIM_TITLE}0.6": 0.5,
},
}
scores, ignored_columns_gt, ignored_columns_model = score_by_separate_csvs(
ground_truth_data, model_output_data
scores, ignored_columns_gt, ignored_columns_model, unmatched_gt, unmatched_mo = (
score_by_separate_csvs(ground_truth_data, model_output_data)
)

assert ignored_columns_gt == {"Unique GT column"}
assert ignored_columns_model == {"Unique MO column"}
assert ignored_columns_gt == ["Unique GT column"]
assert ignored_columns_model == ["Unique MO column"]

assert not unmatched_gt
assert not unmatched_mo

for column in expected_scores:
for metric in expected_scores[column]:
assert np.isclose(
scores[column][metric], expected_scores[column][metric], atol=0.01
), f"Failed on {column} {metric}: expected {expected_scores[column][metric]}, got {scores[column][metric]}"


def test_score_by_separate_csvs_with_key_column() -> None:
# Test data includes a key column named "ID" for matching rows across CSVs
ground_truth_data = [
{
"ID": "1",
"Column1": "Test sentence.",
"Column2": "Another test.",
"Unique GT column": "xyz",
},
{
"ID": "2",
"Column1": "Second sentence.",
"Column2": "Yet another test.",
"Unique GT column": "xyz",
},
{
"ID": "3",
"Column1": "Third unmatched GT sentence.",
"Column2": "Unmatched GT test.",
"Unique GT column": "xyz",
},
]
model_output_data = [
{
"ID": "2",
"Column1": "A different second sentence.",
"Column2": "Yet another test.",
"Unique MO column": "abc",
},
{
"ID": "1",
"Column1": "Test sentence.",
"Column2": "",
"Unique MO column": "abc",
},
{
"ID": "4",
"Column1": "Fourth unmatched MO sentence.",
"Column2": "Unmatched MO test.",
"Unique MO column": "abc",
},
]
expected_scores = {
"Column1": {
"avg_f1": 90.0, # Considering matched rows only
"exact_match": 0.5,
"no_output": 0,
f"{SIM_TITLE}0.8": 1.0,
f"{SIM_TITLE}0.6": 1.0,
},
"Column2": {
"avg_f1": 50.0, # One exact match, one no_output, considering only matched rows
"exact_match": 0.5,
"no_output": 0.5,
f"{SIM_TITLE}0.8": 0.5,
f"{SIM_TITLE}0.6": 0.5,
},
}
key_column = "ID"
scores, ignored_columns_gt, ignored_columns_model, unmatched_gt, unmatched_mo = (
score_by_separate_csvs(
ground_truth_data, model_output_data, key_column=key_column
)
)

# Test for ignored columns
assert ignored_columns_gt == ["Unique GT column"]
assert ignored_columns_model == ["Unique MO column"]

# Test for mismatched rows
assert unmatched_gt == ["3"]
assert unmatched_mo == ["4"]

# Test scores for matched rows
for column in expected_scores:
for metric in expected_scores[column]:
assert np.isclose(
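
For reference, the 90.0 expected for Column1 above follows from the two matched rows (IDs 1 and 2): one exact match (F1 = 1.0) and a partial overlap of "second sentence" against "a different second sentence", which normalizes to three tokens with two shared, giving F1 = 0.8; the average of 0.9 is then expressed on the 0-100 scale the expected values use. A short arithmetic check (illustrative, not part of the diff):

# Column1, row ID "2": "second sentence" vs "different second sentence" after normalization
shared, gt_tokens, model_tokens = 2, 2, 3
precision = shared / model_tokens   # 2/3
recall = shared / gt_tokens         # 1.0
f1 = 2 * precision * recall / (precision + recall)   # 0.8
avg_f1 = (1.0 + f1) / 2 * 100       # the exact-match row contributes 1.0
assert round(avg_f1, 1) == 90.0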
5 changes: 5 additions & 0 deletions tests/utils/test_text.py
@@ -24,6 +24,11 @@ def test_normalize_with_extra_whitespace() -> None:
assert normalize(" This is a test. ") == "this is test"


def test_normalize_with_special() -> None:
"""Test normalization with special chars."""
assert normalize("AMENDMENT_NUMBER") == "amendment number"


def test_get_tokens_empty() -> None:
"""Test get_tokens returns an empty list for empty input."""
assert get_tokens("") == []
