Key column support for alignment
Taqi Jaffri committed Mar 13, 2024
1 parent ecc9f14 commit 844b574
Showing 5 changed files with 201 additions and 54 deletions.
29 changes: 23 additions & 6 deletions docugami_dfm_benchmarks/cli.py
@@ -48,16 +48,20 @@ def eval_by_csv(
output_format: OutputFormat = OutputFormat.GITHUB_MARKDOWN,
) -> None:

with open(ground_truth_csv) as gt_file:
with open(ground_truth_csv, encoding="utf-8-sig") as gt_file:
gt_reader = csv.DictReader(gt_file)
gt_data = [row for row in gt_reader]
with open(model_output_csv) as model_output_file:
with open(model_output_csv, encoding="utf-8-sig") as model_output_file:
model_output_reader = csv.DictReader(model_output_file)
model_output_data = [row for row in model_output_reader]

scores, ignored_columns_gt, ignored_columns_model = score_by_separate_csvs(
gt_data, model_output_data
)
(
scores,
ignored_columns_gt,
ignored_columns_model,
unmatched_gt,
unmatched_mo,
) = score_by_separate_csvs(gt_data, model_output_data, key_column)
table = tabulate_scores(scores, output_format)
typer.echo(table)

@@ -68,6 +72,14 @@ def eval_by_csv(
f"Ignored columns in model output CSV (no match in ground truth): {ignored_columns_model}"
)

if key_column:
typer.echo(
f"{len(unmatched_gt)} rows in ground truth did not have matching rows in model output (based on key column {key_column})"
)
typer.echo(
f"{len(unmatched_mo)} rows in model output did not have matching rows in ground truth (based on key column {key_column})"
)


def _version_callback(value: bool) -> None:
"""
@@ -100,7 +112,12 @@ def main(
if __name__ == "__main__":
if sys.gettrace() is not None:
# debugger attached, modify call below and attach
eval_by_column(Path("./temp/CSL-Small.csv")) # nosec
# eval_by_column(Path("./temp/CSL-Small.csv")) # nosec
eval_by_csv(
Path("./temp/tangible_ground_truth.csv"),
Path("./temp/tangible_model_output.csv"),
key_column="COMPLAINT PDF FILE NAME",
)
else:
# proceed as normal
app()
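
For context on what the changed CLI path now does end to end: both CSVs are read with the utf-8-sig codec, so a byte-order mark written by Excel exports cannot leak into the first header name, and the optional key column is passed through to the scorer, which now also reports unmatched rows. A minimal standalone sketch of that flow, not part of the diff; the file names and the "ID" key column are placeholders, not values from this commit:

import csv

from docugami_dfm_benchmarks.utils.scorer import score_by_separate_csvs

# utf-8-sig strips a leading BOM if one is present, matching the change above.
with open("ground_truth.csv", encoding="utf-8-sig") as gt_file:
    gt_data = list(csv.DictReader(gt_file))
with open("model_output.csv", encoding="utf-8-sig") as mo_file:
    mo_data = list(csv.DictReader(mo_file))

# The scorer now always returns five values; the last two are empty without a key column.
scores, ignored_gt, ignored_mo, unmatched_gt, unmatched_mo = score_by_separate_csvs(
    gt_data, mo_data, key_column="ID"  # placeholder key column name
)
print(f"{len(unmatched_gt)} ground-truth rows and {len(unmatched_mo)} model rows had no match")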
125 changes: 82 additions & 43 deletions docugami_dfm_benchmarks/utils/scorer.py
@@ -33,39 +33,48 @@ def _finalize_scores(scores: dict[str, Any], total_rows: int) -> None:
scores["avg_f1"] = avg_f1


def _compute_scores_for_column(
gt_annotations: list[str], model_outputs: list[str]
) -> dict[str, Any]:
"""
Computes the scores for a single column given lists of ground truth annotations and model outputs.
"""
scores = {
def _initialize_score_structure() -> dict:
"""Initializes the structure for storing scores."""
return {
f"{SIM_TITLE}0.8": 0,
f"{SIM_TITLE}0.6": 0,
"exact_match": 0,
"no_output": 0,
"f1_per_row": [],
}

for gt_annotation, model_output in zip(gt_annotations, model_outputs):
gt_annotation = normalize(gt_annotation)
model_output = normalize(model_output)

scores["f1_per_row"].append(compute_f1(gt_annotation, model_output)) # type: ignore
def _update_scores(score_struct: dict, gt_annotation: str, model_output: str) -> None:
"""
Updates the score structure based on a single row's GT and model output, including semantic similarity.
"""
# Normalize the inputs (callers may already have normalized; normalizing again is harmless)
gt_annotation = normalize(gt_annotation)
model_output = normalize(model_output)

if gt_annotation == model_output:
scores["exact_match"] += 1 # type: ignore
elif not model_output and gt_annotation:
scores["no_output"] += 1 # type: ignore
# Compute F1 score and update
score_struct["f1_per_row"].append(compute_f1(gt_annotation, model_output))

if gt_annotation and model_output:
similarity = semantic_similarity(gt_annotation, model_output)
if similarity >= 0.8:
scores[f"{SIM_TITLE}0.8"] += 1 # type: ignore
if similarity >= 0.6:
scores[f"{SIM_TITLE}0.6"] += 1 # type: ignore
# Check for exact matches
if gt_annotation == model_output:
score_struct["exact_match"] += 1
elif not model_output and gt_annotation:
# Consider cases where the model output is empty but there is a GT annotation
score_struct["no_output"] += 1

return scores
# Calculate semantic similarity if both GT and model outputs are non-empty
if gt_annotation and model_output:
similarity = semantic_similarity(gt_annotation, model_output)
if similarity >= 0.8:
score_struct[f"{SIM_TITLE}0.8"] += 1
if similarity >= 0.6:
score_struct[f"{SIM_TITLE}0.6"] += 1


def _finalize_all_scores(scores: dict, total_matches: int) -> None:
"""Finalizes all score structures within the scores dict."""
for score_struct in scores.values():
_finalize_scores(score_struct, total_matches)


def score_by_column(data: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
@@ -98,8 +107,17 @@ def score_by_column(data: list[dict[str, Any]]) -> dict[str, dict[str, Any]]:
for column in tqdm(model_columns):
gt_annotations = [normalize(row[KEY_GT]) for row in data]
model_outputs = [normalize(row[column]) for row in data]
column_scores = _compute_scores_for_column(gt_annotations, model_outputs)

# Initialize the score structure for this column
column_scores = _initialize_score_structure()

# Update scores for each row
for gt_annotation, model_output in zip(gt_annotations, model_outputs):
_update_scores(column_scores, gt_annotation, model_output)

# Finalize scores by calculating average F1 and normalizing metrics
_finalize_scores(column_scores, len(data))

scores[column] = column_scores

return scores
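
The loop above is the pattern this refactor enables: the old _compute_scores_for_column body is split into _initialize_score_structure, a per-row _update_scores, and the pre-existing _finalize_scores. A minimal illustrative sketch of that composition on toy data (not part of the diff):

from docugami_dfm_benchmarks.utils.scorer import (
    _finalize_scores,
    _initialize_score_structure,
    _update_scores,
)

gt = ["Test sentence.", "Another test."]
model = ["Test sentence.", ""]  # second row: no model output

column_scores = _initialize_score_structure()
for gt_annotation, model_output in zip(gt, model):
    _update_scores(column_scores, gt_annotation, model_output)
_finalize_scores(column_scores, len(gt))  # averages F1 and normalizes the counters

print(column_scores["exact_match"], column_scores["no_output"], column_scores["avg_f1"])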
@@ -109,7 +127,7 @@ def score_by_separate_csvs(
ground_truth_data: list[dict[str, Any]],
model_output_data: list[dict[str, Any]],
key_column: Optional[str] = None,
) -> tuple[dict, set, set]:
) -> tuple[dict, list[str], list[str], list[str], list[str]]:
"""
Scores model output against ground truth data when provided in separate CSVs.
Each CSV should have columns with identical names for comparison. This function
@@ -126,7 +144,6 @@ def score_by_separate_csvs(
Returns:
- A dict of scores for each column common to both CSVs.
- Sorted lists of ignored columns in the ground truth and model output CSVs (no match in the other file).
- Sorted lists of key values found only in the ground truth or only in the model output (empty unless a key column is given).
"""
# Create mappings for ground truth and model output columns from normalized to original names
gt_columns_normalized = {normalize(key): key for key in ground_truth_data[0].keys()}
model_columns_normalized = {
normalize(key): key for key in model_output_data[0].keys()
@@ -137,29 +154,51 @@ def score_by_separate_csvs(
model_columns_normalized.keys()
)

# Initialize scores dictionary
scores = {}

# Prepare sets to track ignored columns based on their original names
ignored_columns_gt = set(ground_truth_data[0].keys()) - set(
gt_columns_normalized[norm] for norm in common_columns_normalized
)
ignored_columns_model = set(model_output_data[0].keys()) - set(
model_columns_normalized[norm] for norm in common_columns_normalized
)

# Iterate over common columns using the normalized names to facilitate comparison
for norm_col in common_columns_normalized:
original_gt_col = gt_columns_normalized[norm_col]
original_model_col = model_columns_normalized[norm_col]

gt_annotations = [row[original_gt_col] for row in ground_truth_data]
model_outputs = [row[original_model_col] for row in model_output_data]

column_scores = _compute_scores_for_column(gt_annotations, model_outputs)
_finalize_scores(column_scores, len(ground_truth_data))
scores[original_gt_col] = (
column_scores # Use the ground truth's original column name
)

return scores, ignored_columns_gt, ignored_columns_model
if key_column:
gt_keyed_data = {row[key_column]: row for row in ground_truth_data}
mo_keyed_data = {row[key_column]: row for row in model_output_data}
matched_rows_set = set(gt_keyed_data.keys()).intersection(mo_keyed_data.keys())
unmatched_gt = set(gt_keyed_data.keys()) - matched_rows_set
unmatched_mo = set(mo_keyed_data.keys()) - matched_rows_set
matched_rows = list(
matched_rows_set
) # Ensure this is always a list for consistency
else:
matched_rows = list(range(len(ground_truth_data))) # type: ignore
unmatched_gt = unmatched_mo = set()

for match in matched_rows:
if key_column:
gt_row = gt_keyed_data[match]
mo_row = mo_keyed_data[match]
else:
gt_row = ground_truth_data[int(match)]
mo_row = model_output_data[int(match)]

for norm_col in common_columns_normalized:
original_gt_col = gt_columns_normalized[norm_col]
original_model_col = model_columns_normalized[norm_col]
if original_gt_col in gt_row and original_model_col in mo_row:
gt_annotation = gt_row[original_gt_col]
model_output = mo_row[original_model_col]
if original_gt_col not in scores:
scores[original_gt_col] = _initialize_score_structure()
_update_scores(scores[original_gt_col], gt_annotation, model_output)

_finalize_all_scores(scores, len(matched_rows))

return (
scores,
sorted(ignored_columns_gt),
sorted(ignored_columns_model),
sorted(unmatched_gt),
sorted(unmatched_mo),
)
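
To make the new alignment behaviour concrete, here is a small illustrative call (not part of the diff) showing how rows are paired on the key column and how the extra return values surface rows that could not be matched. The column names and values are invented for the example:

from docugami_dfm_benchmarks.utils.scorer import score_by_separate_csvs

gt_rows = [
    {"ID": "1", "Summary": "Payment was late."},
    {"ID": "2", "Summary": "Contract renewed."},
]
model_rows = [
    {"ID": "2", "Summary": "The contract was renewed."},
    {"ID": "3", "Summary": "Row with no ground truth."},
]

scores, ignored_gt, ignored_mo, unmatched_gt, unmatched_mo = score_by_separate_csvs(
    gt_rows, model_rows, key_column="ID"
)

# Only the rows sharing ID "2" are scored; the others are reported back.
assert unmatched_gt == ["1"]
assert unmatched_mo == ["3"]
assert not ignored_gt and not ignored_mo  # both CSVs have the same columns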
2 changes: 2 additions & 0 deletions docugami_dfm_benchmarks/utils/text.py
@@ -13,6 +13,7 @@ def white_space_fix(text: str) -> str:
return " ".join(text.split())

def remove_punc(text: str) -> str:
text = text.replace("_", " ")
exclude = set(string.punctuation)
return "".join(ch for ch in text if ch not in exclude)

@@ -21,6 +22,7 @@ def lower(text: str) -> str:

return white_space_fix(remove_articles(remove_punc(lower(text))))


def get_tokens(s: str) -> list[str]:
"""Gets normalized tokens from the given string."""
if not s:
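
The one-line change above (mapping underscores to spaces before punctuation is stripped) is what lets snake_case values such as key-column headers compare cleanly: without it, remove_punc would fuse the words into a single token. A quick illustrative sketch, mirroring the new test further below:

from docugami_dfm_benchmarks.utils.text import normalize

# Underscores now become spaces, so the words stay separate tokens.
assert normalize("AMENDMENT_NUMBER") == "amendment number"
# Previously the underscore was stripped as punctuation, yielding "amendmentnumber".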
94 changes: 89 additions & 5 deletions tests/utils/test_scorer.py
@@ -54,7 +54,7 @@ def test_score_by_column() -> None:
)


def test_score_by_separate_csvs() -> None:
def test_score_by_separate_csvs_aligned() -> None:
ground_truth_data = [
{
"Column1": "Test sentence.",
@@ -95,13 +95,97 @@ def test_score_by_separate_csvs() -> None:
f"{SIM_TITLE}0.6": 0.5,
},
}
scores, ignored_columns_gt, ignored_columns_model = score_by_separate_csvs(
ground_truth_data, model_output_data
scores, ignored_columns_gt, ignored_columns_model, unmatched_gt, unmatched_mo = (
score_by_separate_csvs(ground_truth_data, model_output_data)
)

assert ignored_columns_gt == {"Unique GT column"}
assert ignored_columns_model == {"Unique MO column"}
assert ignored_columns_gt == ["Unique GT column"]
assert ignored_columns_model == ["Unique MO column"]

assert not unmatched_gt
assert not unmatched_mo

for column in expected_scores:
for metric in expected_scores[column]:
assert np.isclose(
scores[column][metric], expected_scores[column][metric], atol=0.01
), f"Failed on {column} {metric}: expected {expected_scores[column][metric]}, got {scores[column][metric]}"


def test_score_by_separate_csvs_with_key_column() -> None:
# Test data includes a key column named "ID" for matching rows across CSVs
ground_truth_data = [
{
"ID": "1",
"Column1": "Test sentence.",
"Column2": "Another test.",
"Unique GT column": "xyz",
},
{
"ID": "2",
"Column1": "Second sentence.",
"Column2": "Yet another test.",
"Unique GT column": "xyz",
},
{
"ID": "3",
"Column1": "Third unmatched GT sentence.",
"Column2": "Unmatched GT test.",
"Unique GT column": "xyz",
},
]
model_output_data = [
{
"ID": "2",
"Column1": "A different second sentence.",
"Column2": "Yet another test.",
"Unique MO column": "abc",
},
{
"ID": "1",
"Column1": "Test sentence.",
"Column2": "",
"Unique MO column": "abc",
},
{
"ID": "4",
"Column1": "Fourth unmatched MO sentence.",
"Column2": "Unmatched MO test.",
"Unique MO column": "abc",
},
]
expected_scores = {
"Column1": {
"avg_f1": 90.0, # Considering matched rows only
"exact_match": 0.5,
"no_output": 0,
f"{SIM_TITLE}0.8": 1.0,
f"{SIM_TITLE}0.6": 1.0,
},
"Column2": {
"avg_f1": 50.0, # One exact match, one no_output, considering only matched rows
"exact_match": 0.5,
"no_output": 0.5,
f"{SIM_TITLE}0.8": 0.5,
f"{SIM_TITLE}0.6": 0.5,
},
}
key_column = "ID"
scores, ignored_columns_gt, ignored_columns_model, unmatched_gt, unmatched_mo = (
score_by_separate_csvs(
ground_truth_data, model_output_data, key_column=key_column
)
)

# Test for ignored columns
assert ignored_columns_gt == ["Unique GT column"]
assert ignored_columns_model == ["Unique MO column"]

# Test for mismatched rows
assert unmatched_gt == ["3"]
assert unmatched_mo == ["4"]

# Test scores for matched rows
for column in expected_scores:
for metric in expected_scores[column]:
assert np.isclose(
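
For reference, the 90.0 expected for Column1 above follows from the two matched rows (IDs 1 and 2): one exact match (F1 = 1.0) and a partial overlap of "second sentence" against "a different second sentence", which normalizes to three tokens with two shared, giving F1 = 0.8; the average of 0.9 is then expressed on the 0-100 scale the expected values use. A short arithmetic check (illustrative, not part of the diff):

# Column1, row ID "2": "second sentence" vs "different second sentence" after normalization
shared, gt_tokens, model_tokens = 2, 2, 3
precision = shared / model_tokens   # 2/3
recall = shared / gt_tokens         # 1.0
f1 = 2 * precision * recall / (precision + recall)   # 0.8
avg_f1 = (1.0 + f1) / 2 * 100       # the exact-match row contributes 1.0
assert round(avg_f1, 1) == 90.0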
5 changes: 5 additions & 0 deletions tests/utils/test_text.py
@@ -24,6 +24,11 @@ def test_normalize_with_extra_whitespace() -> None:
assert normalize(" This is a test. ") == "this is test"


def test_normalize_with_special() -> None:
"""Test normalization with special chars."""
assert normalize("AMENDMENT_NUMBER") == "amendment number"


def test_get_tokens_empty() -> None:
"""Test get_tokens returns an empty list for empty input."""
assert get_tokens("") == []
