Commit
Merge pull request #15 from docugami/tjaffri/diff
Tjaffri/diff
Showing 7 changed files with 289 additions and 99 deletions.
@@ -1,119 +1,145 @@
-from enum import Enum
 from typing import Any
 
 import numpy as np
-from tabulate import tabulate
 from tqdm import tqdm
 
-from docugami_dfm_benchmarks.utils.similarity import compute_f1, semantic_similarity
+from docugami_dfm_benchmarks.utils.similarity import (
+    SIM_TITLE,
+    compute_f1,
+    semantic_similarity,
+)
 from docugami_dfm_benchmarks.utils.text import normalize
 
 KEY_GT = "Ground Truth"
-sim_title = "Similarity@>="
 
 
-class OutputFormat(str, Enum):
-    TSV = "tsv"
-    GITHUB_MARKDOWN = "github"
+def _finalize_scores(scores: dict[str, Any], total_rows: int) -> None:
+    """
+    Normalizes scores by the total number of rows and calculates the average F1 score.
+    Parameters:
+    - scores: The scores dictionary for a single column.
+    - total_rows: The total number of rows over which scores were computed.
+    Modifies the scores dictionary in-place to include normalized metrics and the average F1 score.
+    """
+    avg_f1 = 0
+    for metric in list(scores):
+        if metric == "f1_per_row":
+            avg_f1 = np.mean(scores[metric]) * 100
+        else:
+            scores[metric] /= total_rows
+
+    scores["avg_f1"] = avg_f1
 
 
-def score_data(data: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
+def _compute_scores_for_column(
+    gt_annotations: list[str], model_outputs: list[str]
+) -> dict[str, Any]:
     """
+    Computes the scores for a single column given lists of ground truth annotations and model outputs.
+    """
-    Scores the data in the given input. Assumes data is in the following format:
+    scores = {
+        f"{SIM_TITLE}0.8": 0,
+        f"{SIM_TITLE}0.6": 0,
+        "exact_match": 0,
+        "no_output": 0,
+        "f1_per_row": [],
+    }
+
+    for gt_annotation, model_output in zip(gt_annotations, model_outputs):
+        gt_annotation = normalize(gt_annotation)
+        model_output = normalize(model_output)
+
+        scores["f1_per_row"].append(compute_f1(gt_annotation, model_output))  # type: ignore
+
-    data_col_1 | data_col_2 | ... | data_col_n | Ground Truth | model_col_1 | ... | model_col_n
-    -----------|------------|-----|------------|--------------|-------------|-----|------------
-    data_x     | data_y     | ... | data_z     | label_x      | label_y     | ... | label_z
-    ...
+        if gt_annotation == model_output:
+            scores["exact_match"] += 1  # type: ignore
+        elif not model_output and gt_annotation:
+            scores["no_output"] += 1  # type: ignore
+
+        if gt_annotation and model_output:
+            similarity = semantic_similarity(gt_annotation, model_output)
+            if similarity >= 0.8:
+                scores[f"{SIM_TITLE}0.8"] += 1  # type: ignore
+            if similarity >= 0.6:
+                scores[f"{SIM_TITLE}0.6"] += 1  # type: ignore
+
+    return scores
+
-    Ignores the data_col_* values, and looks at the columns to the right of Ground Truth.
-
-    Scores all the model_col_* values to the right of the Ground Truth column against the
-    Ground Truth column using a few different metrics.
+def score_by_column(data: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
     """
-    column_headers = list(data[0].keys())
+    Scores the data provided in a single CSV, comparing model outputs directly against
+    a ground truth column. Assumes a specific CSV format where one column specifies the
+    ground truth, and all subsequent columns are model outputs to be scored against this ground truth.
+    Parameters:
+    - data: List of dictionaries representing rows from the CSV. Each dictionary corresponds to a row,
+      with keys as column headers.
+    Returns:
+    - A dictionary of scores for each model output column, including metrics such as similarity thresholds,
+      exact match, no output, and average F1 score.
+    """
+    data_columns = list(data[0].keys())
 
     try:
-        gt_col_index = column_headers.index(KEY_GT)
+        gt_col_index = data_columns.index(KEY_GT)
     except ValueError:
         raise Exception(
-            f"Ground truth annotation column not found, expected {KEY_GT} in list {column_headers}"
+            f"Ground truth annotation column not found, expected {KEY_GT} in list {data_columns}"
        )
 
-    # all columns to the right of the GT column are models
-    ai_model_headers = column_headers[gt_col_index + 1 :]
-    scores: dict[str, dict[str, Any]] = {
-        model: {
-            f"{sim_title}0.8": 0,
-            f"{sim_title}0.6": 0,
-            "exact_match": 0,
-            "no_output": 0,
-            "f1_per_row": [],
-        }
-        for model in ai_model_headers
-    }
+    # all columns to the right of the GT column are considered models
+    model_columns = data_columns[gt_col_index + 1 :]
+    scores = {}
 
-    for row in tqdm(data):
-        gt_annotation = normalize(row[KEY_GT])
-        for model in ai_model_headers:
-            model_output = normalize(row[model])
-
-            # Token F1 for this row
-            scores[model]["f1_per_row"].append(compute_f1(gt_annotation, model_output))
-
-            if gt_annotation == model_output:
-                # Exact match
-                scores[model]["exact_match"] += 1
-            elif not model_output and gt_annotation:
-                # Model output is empty, but ground truth annotation is not
-                scores[model]["no_output"] += 1
-
-            if gt_annotation and model_output:
-                # Semantic similarity at different thresholds
-                similarity = semantic_similarity(gt_annotation, model_output)
-                if similarity >= 0.8:
-                    scores[model][f"{sim_title}0.8"] += 1
-                if similarity >= 0.6:
-                    scores[model][f"{sim_title}0.6"] += 1
-
-    total_rows = len(data)
-
-    for model in ai_model_headers:
-        scores[model][f"{sim_title}0.8"] = scores[model][f"{sim_title}0.8"] / total_rows
-        scores[model][f"{sim_title}0.6"] = scores[model][f"{sim_title}0.6"] / total_rows
-        scores[model]["exact_match"] = scores[model]["exact_match"] / total_rows
-        scores[model]["no_output"] = scores[model]["no_output"] / total_rows
-        scores[model]["avg_f1"] = np.mean(scores[model]["f1_per_row"]) * 100
+    for column in tqdm(model_columns):
+        gt_annotations = [normalize(row[KEY_GT]) for row in data]
+        model_outputs = [normalize(row[column]) for row in data]
+        column_scores = _compute_scores_for_column(gt_annotations, model_outputs)
+        _finalize_scores(column_scores, len(data))
+        scores[column] = column_scores
 
     return scores
 
 
-def tabulate_scores(
-    scores: dict, output_format: OutputFormat = OutputFormat.GITHUB_MARKDOWN
-) -> str:
-    """Tabulates a set of scores (output of the score() function) into a printable view"""
-    headers = [
-        "Model",
-        "Exact Match",
-        f"{sim_title} 0.8",
-        f"{sim_title} 0.6",
-        "Average F1",
-        "No Output",
-    ]
-    table = []
-
-    for model, metrics in scores.items():
-        table.append(
-            [
-                model,
-                metrics["exact_match"],
-                metrics[f"{sim_title}0.8"],
-                metrics[f"{sim_title}0.6"],
-                metrics["avg_f1"],
-                metrics["no_output"],
-            ]
-        )
+def score_by_separate_csvs(
+    ground_truth_data: list[dict[str, Any]], model_output_data: list[dict[str, Any]]
+) -> dict:
+    """
+    Scores model output against ground truth data when provided in separate CSVs.
+    Each CSV should have columns with identical names for comparison. This function
+    computes scores on a per-column basis for all common columns found in both CSVs.
+    Assumes that each row in the ground truth CSV corresponds to the same row in the
+    model output CSV. Columns not present in both CSVs are ignored, and a warning
+    is logged.
+    Parameters:
+    - ground_truth_data: List of dictionaries representing rows from the ground truth CSV.
+    - model_output_data: List of dictionaries representing rows from the model output CSV.
+    Returns:
+    - A dictionary of scores for each common column.
+    """
+    gt_columns = set(ground_truth_data[0].keys())
+    model_columns = set(model_output_data[0].keys())
+    common_columns = gt_columns.intersection(model_columns)
+    ignored_columns = (gt_columns.union(model_columns)) - common_columns
+
+    if ignored_columns:
+        print(
+            f"Warning: Ignoring columns without matches in both CSVs: {ignored_columns}"
+        )
 
-    return tabulate(
-        table, headers=headers, floatfmt=".2f", tablefmt=output_format.value
-    )
+    scores = {}
+    for column in tqdm(common_columns):
+        gt_annotations = [row[column] for row in ground_truth_data]
+        model_outputs = [row[column] for row in model_output_data]
+        column_scores = _compute_scores_for_column(gt_annotations, model_outputs)
+        _finalize_scores(column_scores, len(ground_truth_data))
+        scores[column] = column_scores
+
+    return scores
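
For orientation, here is a minimal sketch of how the refactored entry points above might be called. The module path docugami_dfm_benchmarks.utils.scorer, the CSV file names, and the use of csv.DictReader are assumptions for illustration; only score_by_column and score_by_separate_csvs come from the diff itself.

import csv

# Assumed module path; the diff does not show the scorer file's name.
from docugami_dfm_benchmarks.utils.scorer import score_by_column, score_by_separate_csvs

# Single-CSV layout: a "Ground Truth" column followed by one column per model.
with open("annotations.csv", newline="") as csv_file:  # hypothetical file name
    rows = list(csv.DictReader(csv_file))
per_model_scores = score_by_column(rows)

# Separate-CSV layout: both files share column names, and row i in the ground
# truth file corresponds to row i in the model output file.
with open("ground_truth.csv", newline="") as gt_file:  # hypothetical file name
    gt_rows = list(csv.DictReader(gt_file))
with open("model_output.csv", newline="") as model_file:  # hypothetical file name
    model_rows = list(csv.DictReader(model_file))
per_column_scores = score_by_separate_csvs(gt_rows, model_rows)

Each entry in the returned dictionaries holds the normalized metrics described in the code above (exact match, similarity thresholds, no output) plus avg_f1.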
@@ -0,0 +1,41 @@
+from enum import Enum
+
+from tabulate import tabulate
+
+from docugami_dfm_benchmarks.utils.similarity import SIM_TITLE
+
+
+class OutputFormat(str, Enum):
+    TSV = "tsv"
+    GITHUB_MARKDOWN = "github"
+
+
+def tabulate_scores(
+    scores: dict, output_format: OutputFormat = OutputFormat.GITHUB_MARKDOWN
+) -> str:
+    """Tabulates a set of scores (output of the score() function) into a printable view"""
+    headers = [
+        "Model",
+        "Exact Match",
+        f"{SIM_TITLE} 0.8",
+        f"{SIM_TITLE} 0.6",
+        "Average F1",
+        "No Output",
+    ]
+    table = []
+
+    for model, metrics in scores.items():
+        table.append(
+            [
+                model,
+                metrics["exact_match"],
+                metrics[f"{SIM_TITLE}0.8"],
+                metrics[f"{SIM_TITLE}0.6"],
+                metrics["avg_f1"],
+                metrics["no_output"],
+            ]
+        )
+
+    return tabulate(
+        table, headers=headers, floatfmt=".2f", tablefmt=output_format.value
+    )
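
A correspondingly small sketch of rendering scores with the new tabulate_scores helper follows. The output module path is an assumption (the diff does not name the new file); SIM_TITLE is imported from the similarity module exactly as in the code above, and the metric values are made-up placeholders.

from docugami_dfm_benchmarks.utils.similarity import SIM_TITLE

# Assumed module path for the new OutputFormat/tabulate_scores helpers.
from docugami_dfm_benchmarks.utils.output import OutputFormat, tabulate_scores

# Shape mirrors the per-column dictionaries produced by the scoring functions,
# after _finalize_scores has normalized them; numbers are placeholders.
scores = {
    "model_a": {
        "exact_match": 0.62,
        f"{SIM_TITLE}0.8": 0.55,
        f"{SIM_TITLE}0.6": 0.71,
        "no_output": 0.04,
        "avg_f1": 74.5,
    }
}

print(tabulate_scores(scores, OutputFormat.GITHUB_MARKDOWN))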