Skip to content


refactor common code
Browse files Browse the repository at this point in the history
  • Loading branch information
Taqi Jaffri committed Mar 13, 2024
1 parent 0066370 commit 8c431ad
Show file tree
Hide file tree
Showing 5 changed files with 198 additions and 98 deletions.
2 changes: 1 addition & 1 deletion
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ Make sure you have [poetry]( installed on your m
The eval datasets, including DFM output labels, can be found under `data/`. To evaluate your own model, add a new column to the CSV to the *right* of the _Ground Truth_ column, containing your model's label for each row. To avoid overfitting, please don't train on any of the data in the eval dataset. Then, just run:

poetry run benchmark eval /path/to/data.csv
poetry run benchmark eval-by-column /path/to/data.csv

This should output results for the data in the benchmark, in tabular format. See the current results section below for examples from different benchmarks.
Expand Down
43 changes: 38 additions & 5 deletions docugami_dfm_benchmarks/
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,60 @@

import typer

from docugami_dfm_benchmarks.utils.scorer import OutputFormat, score_data, tabulate_scores
from docugami_dfm_benchmarks.utils.scorer import score_by_column, score_by_separate_csvs
from docugami_dfm_benchmarks.utils.tabulation import OutputFormat, tabulate_scores

app = typer.Typer(
help="Benchmarks for Business Document Foundation Models",
help="Docugami Foundation Model (DFM) Benchmark evaluation scripts",

def eval(
def eval_by_column(
csv_file: Path,
output_format: OutputFormat = OutputFormat.GITHUB_MARKDOWN,
) -> None:
Scores the data in the given input CSV file. Assumes data is in the following format:
data_col_1 | data_col_2 | ... | data_col_n | Ground Truth | model_col_1 | ... | model_col_n
data_x | data_y | ... | data_z | label_x | label_y | ... | label_z
Ignores the data_col_* values, and looks at the columns to the right of Ground Truth.
Scores all the model_col_* values to the right of the Ground Truth column against the
Ground Truth column using a few different metrics.
with open(csv_file) as file:
reader = csv.DictReader(file)
data = [row for row in reader]
scores = score_data(data)
scores = score_by_column(data)
table = tabulate_scores(scores, output_format)

def eval_by_csv(
ground_truth_csv: Path,
model_output_csv: Path,
output_format: OutputFormat = OutputFormat.GITHUB_MARKDOWN,
) -> None:

with open(ground_truth_csv) as gt_file:
gt_reader = csv.DictReader(gt_file)
gt_data = [row for row in gt_reader]
with open(model_output_csv) as model_output_file:
model_output_reader = csv.DictReader(model_output_file)
model_output_data = [row for row in model_output_reader]

scores = score_by_separate_csvs(gt_data, model_output_data)
table = tabulate_scores(scores, output_format)

def _version_callback(value: bool) -> None:
Gets the current version number from the Poetry package.
Expand Down Expand Up @@ -57,7 +90,7 @@ def main(
if __name__ == "__main__":
if sys.gettrace() is not None:
# debugger attached, modify call below and attach
eval(Path("./temp/CSL-Small.csv")) # nosec
eval_by_column(Path("./temp/CSL-Small.csv")) # nosec
# proceed as normal
202 changes: 113 additions & 89 deletions docugami_dfm_benchmarks/utils/
Original file line number Diff line number Diff line change
@@ -1,119 +1,143 @@
from enum import Enum
from typing import Any

import numpy as np
from tabulate import tabulate
from tqdm import tqdm

from docugami_dfm_benchmarks.utils.similarity import compute_f1, semantic_similarity
from docugami_dfm_benchmarks.utils.similarity import (
from docugami_dfm_benchmarks.utils.text import normalize

KEY_GT = "Ground Truth"
sim_title = "Similarity@>="

class OutputFormat(str, Enum):
TSV = "tsv"
def _finalize_scores(scores, total_rows):
Normalizes scores by the total number of rows and calculates the average F1 score.
- scores: The scores dictionary for a single column.
- total_rows: The total number of rows over which scores were computed.
Modifies the scores dictionary in-place to include normalized metrics and the average F1 score.
avg_f1 = 0
for metric in list(scores):
if metric != "f1_per_row":
scores[metric] /= total_rows
avg_f1 = np.mean(scores[metric]) * 100

scores["avg_f1"] = avg_f1

def score_data(data: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
def _compute_scores_for_column(gt_annotations, model_outputs):
Computes the scores for a single column given lists of ground truth annotations and model outputs.
Scores the data in the given input. Assumes data is in the following format:
scores = {
f"{SIM_TITLE}0.8": 0,
f"{SIM_TITLE}0.6": 0,
"exact_match": 0,
"no_output": 0,
"f1_per_row": [],

for gt_annotation, model_output in zip(gt_annotations, model_outputs):
gt_annotation = normalize(gt_annotation)
model_output = normalize(model_output)

scores["f1_per_row"].append(compute_f1(gt_annotation, model_output))

data_col_1 | data_col_2 | ... | data_col_n | Ground Truth | model_col_1 | ... | model_col_n
data_x | data_y | ... | data_z | label_x | label_y | ... | label_z
if gt_annotation == model_output:
scores["exact_match"] += 1
elif not model_output and gt_annotation:
scores["no_output"] += 1

if gt_annotation and model_output:
similarity = semantic_similarity(gt_annotation, model_output)
if similarity >= 0.8:
scores[f"{SIM_TITLE}0.8"] += 1
if similarity >= 0.6:
scores[f"{SIM_TITLE}0.6"] += 1

return scores

Ignores the data_col_* values, and looks at the columns to the right of Ground Truth.

Scores all the model_col_* values to the right of the Ground Truth column against the
Ground Truth column using a few different metrics.
def score_by_column(data: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
column_headers = list(data[0].keys())
Scores the data provided in a single CSV, comparing model outputs directly against
a ground truth column. Assumes a specific CSV format where one column specifies the
ground truth, and all subsequent columns are model outputs to be scored against this ground truth.
- data: List of dictionaries representing rows from the CSV. Each dictionary corresponds to a row,
with keys as column headers.
- A dictionary of scores for each model output column, including metrics such as similarity thresholds,
exact match, no output, and average F1 score.
data_columns = list(data[0].keys())

gt_col_index = column_headers.index(KEY_GT)
gt_col_index = data_columns.index(KEY_GT)
except ValueError:
raise Exception(
f"Ground truth annotation column not found, expected {KEY_GT} in list {column_headers}"
f"Ground truth annotation column not found, expected {KEY_GT} in list {data_columns}"

# all columns to the right of the GT column are models
ai_model_headers = column_headers[gt_col_index + 1 :]
scores: dict[str, dict[str, Any]] = {
model: {
f"{sim_title}0.8": 0,
f"{sim_title}0.6": 0,
"exact_match": 0,
"no_output": 0,
"f1_per_row": [],
for model in ai_model_headers
# all columns to the right of the GT column are considered models
model_columns = data_columns[gt_col_index + 1 :]
scores = {}

for row in tqdm(data):
gt_annotation = normalize(row[KEY_GT])
for model in ai_model_headers:
model_output = normalize(row[model])

# Token F1 for this row
scores[model]["f1_per_row"].append(compute_f1(gt_annotation, model_output))

if gt_annotation == model_output:
# Exact match
scores[model]["exact_match"] += 1
elif not model_output and gt_annotation:
# Model output is empty, but ground truth annotation is not
scores[model]["no_output"] += 1

if gt_annotation and model_output:
# Semantic similarity at different thresholds
similarity = semantic_similarity(gt_annotation, model_output)
if similarity >= 0.8:
scores[model][f"{sim_title}0.8"] += 1
if similarity >= 0.6:
scores[model][f"{sim_title}0.6"] += 1

total_rows = len(data)

for model in ai_model_headers:
scores[model][f"{sim_title}0.8"] = scores[model][f"{sim_title}0.8"] / total_rows
scores[model][f"{sim_title}0.6"] = scores[model][f"{sim_title}0.6"] / total_rows
scores[model]["exact_match"] = scores[model]["exact_match"] / total_rows
scores[model]["no_output"] = scores[model]["no_output"] / total_rows
scores[model]["avg_f1"] = np.mean(scores[model]["f1_per_row"]) * 100
for column in tqdm(model_columns):
gt_annotations = [normalize(row[KEY_GT]) for row in data]
model_outputs = [normalize(row[column]) for row in data]
column_scores = _compute_scores_for_column(gt_annotations, model_outputs)
_finalize_scores(column_scores, len(data))
scores[column] = column_scores

return scores

def tabulate_scores(
scores: dict, output_format: OutputFormat = OutputFormat.GITHUB_MARKDOWN
) -> str:
"""Tabulates a set of scores (output of the score() function) into a printable view"""
headers = [
"Exact Match",
f"{sim_title} 0.8",
f"{sim_title} 0.6",
"Average F1",
"No Output",
table = []

for model, metrics in scores.items():
def score_by_separate_csvs(
ground_truth_data: list[dict[str, Any]], model_output_data: list[dict[str, Any]]
) -> dict:
Scores model output against ground truth data when provided in separate CSVs.
Each CSV should have columns with identical names for comparison. This function
computes scores on a per-column basis for all common columns found in both CSVs.
Assumes that each row in the ground truth CSV corresponds to the same row in the
model output CSV. Columns not present in both CSVs are ignored, and a warning
is logged.
- ground_truth_data: List of dictionaries representing rows from the ground truth CSV.
- model_output_data: List of dictionaries representing rows from the model output CSV.
- A dictionary of scores for each common column.
gt_columns = set(ground_truth_data[0].keys())
model_columns = set(model_output_data[0].keys())
common_columns = gt_columns.intersection(model_columns)
ignored_columns = (gt_columns.union(model_columns)) - common_columns

if ignored_columns:
f"Warning: Ignoring columns without matches in both CSVs: {ignored_columns}"

return tabulate(
table, headers=headers, floatfmt=".2f", tablefmt=output_format.value
scores = {}
for column in tqdm(common_columns):
gt_annotations = [row[column] for row in ground_truth_data]
model_outputs = [row[column] for row in model_output_data]
column_scores = _compute_scores_for_column(gt_annotations, model_outputs)
_finalize_scores(column_scores, len(ground_truth_data))
scores[column] = column_scores

return scores
8 changes: 5 additions & 3 deletions docugami_dfm_benchmarks/utils/
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,15 @@

from docugami_dfm_benchmarks.utils.text import get_tokens, normalize

embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
SIM_TITLE = "Similarity@>="

_embedding_model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

def semantic_similarity(text1: str, text2: str) -> Number:
"""Compute semantic similarity (cosine) between embeddings of given texts."""
embedding_1 = embedding_model.encode(text1, convert_to_tensor=True)
embedding_2 = embedding_model.encode(text2, convert_to_tensor=True)
embedding_1 = _embedding_model.encode(text1, convert_to_tensor=True)
embedding_2 = _embedding_model.encode(text2, convert_to_tensor=True)
return util.pytorch_cos_sim(embedding_1, embedding_2).item()

Expand Down
41 changes: 41 additions & 0 deletions docugami_dfm_benchmarks/utils/
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from enum import Enum

from tabulate import tabulate

from docugami_dfm_benchmarks.utils.similarity import SIM_TITLE

class OutputFormat(str, Enum):
TSV = "tsv"

def tabulate_scores(
scores: dict, output_format: OutputFormat = OutputFormat.GITHUB_MARKDOWN
) -> str:
"""Tabulates a set of scores (output of the score() function) into a printable view"""
headers = [
"Exact Match",
f"{SIM_TITLE} 0.8",
f"{SIM_TITLE} 0.6",
"Average F1",
"No Output",
table = []

for model, metrics in scores.items():

return tabulate(
table, headers=headers, floatfmt=".2f", tablefmt=output_format.value

0 comments on commit 8c431ad

Please sign in to comment.