Skip to content

Commit

Permalink
Changed to use pivot tables in the download formatter (#167)
Browse files Browse the repository at this point in the history
Closes #166 

Changed df_to_download to use pivot tables to generate the CSV. This
greatly reduces the processing time of this function to just a few
seconds for very large files, and the time stays roughly constant as
file size increases.

Also used QUOTE_NONNUMERIC instead of specifically escaping just the
description column.
  • Loading branch information
jcadam14 committed Apr 30, 2024
1 parent defa9c5 commit 7b0aeff
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 45 deletions.
3 changes: 0 additions & 3 deletions src/regtech_data_validator/create_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,7 @@ def validate_phases(df: pd.DataFrame, context: dict[str, str] | None = None) ->
p1_is_valid, p1_findings = validate(get_phase_1_schema_for_lei(context), df)

if not p1_is_valid:
p1_findings.insert(1, "validation_phase", ValidationPhase.SYNTACTICAL.value, True)
return p1_is_valid, p1_findings, ValidationPhase.SYNTACTICAL.value

p2_is_valid, p2_findings = validate(get_phase_2_schema_for_lei(context), df)
if not p2_is_valid:
p2_findings.insert(1, "validation_phase", ValidationPhase.LOGICAL.value, True)
return p2_is_valid, p2_findings, ValidationPhase.LOGICAL.value
101 changes: 65 additions & 36 deletions src/regtech_data_validator/data_formatters.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import csv
import json
import pandas as pd

Expand All @@ -7,37 +8,65 @@


def df_to_download(df: pd.DataFrame) -> str:
highest_field_count = 0
full_csv = []
if not df.empty:

for _, group_df in df.groupby(['validation_id', 'record_no']):
v_head = group_df.iloc[0]
row_data = [
v_head['validation_severity'],
v_head['validation_id'],
v_head['validation_name'],
str(v_head['record_no'] + 1),
v_head['uid'],
v_head['fig_link'],
f"\"{v_head['validation_desc']}\"",
]
current_count = 0
fields = group_df.iterrows() if v_head['validation_id'] in more_than_2_fields else group_df[::-1].iterrows()
for _, field_data in fields:
row_data.extend([field_data['field_name'], field_data['field_value']])
current_count += 1

full_csv.append(",".join(row_data))
highest_field_count = current_count if current_count > highest_field_count else highest_field_count

field_headers = []
for i in range(highest_field_count):
field_headers.append(f"field_{i+1}")
field_headers.append(f"value_{i+1}")
full_csv.insert(
0,
",".join(
if df.empty:
        # return headers of csv for 'empty' report
return "validation_type,validation_id,validation_name,row,unique_identifier,fig_link,validation_description"
else:
df.reset_index(drop=True, inplace=True)
df = df.drop(["scope"], axis=1)

df['field_number'] = (
df.groupby(
[
"validation_severity",
"validation_id",
"validation_name",
"record_no",
"uid",
"fig_link",
"validation_desc",
]
).cumcount()
+ 1
)
df_pivot = df.pivot_table(
index=[
"validation_severity",
"validation_id",
"validation_name",
"record_no",
"uid",
"fig_link",
"validation_desc",
],
columns="field_number",
values=["field_name", "field_value"],
aggfunc="first",
).reset_index()

df_pivot.columns = [f"{col[0]}_{col[1]}" if col[1] else col[0] for col in df_pivot.columns]

df_pivot.rename(
columns={f"field_name_{i}": f"field_{i}" for i in range(1, len(df_pivot.columns) // 2 + 1)}, inplace=True
)
df_pivot.rename(
columns={f"field_value_{i}": f"value_{i}" for i in range(1, len(df_pivot.columns) // 2 + 1)}, inplace=True
)
df_pivot.rename(
columns={
"record_no": "row",
"validation_severity": "validation_type",
"uid": "unique_identifier",
"validation_desc": "validation_description",
},
inplace=True,
)

field_columns = [col for col in df_pivot.columns if col.startswith('field_')]
value_columns = [col for col in df_pivot.columns if col.startswith('value_')]
sorted_columns = [col for pair in zip(field_columns, value_columns) for col in pair]

df_pivot = df_pivot[
[
"validation_type",
"validation_id",
Expand All @@ -47,12 +76,12 @@ def df_to_download(df: pd.DataFrame) -> str:
"fig_link",
"validation_description",
]
+ field_headers
),
)
csv_string = "\n".join(full_csv)
+ sorted_columns
]

df_pivot['row'] += 1

return csv_string
return df_pivot.to_csv(index=False, quoting=csv.QUOTE_NONNUMERIC)


def df_to_str(df: pd.DataFrame) -> str:
Expand Down
8 changes: 4 additions & 4 deletions tests/test_output_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,13 +132,13 @@ def test_output_json(self):
def test_download_csv(self):
expected_output = dedent(
"""
validation_type,validation_id,validation_name,row,unique_identifier,fig_link,validation_description,field_1,value_1
Error,E3000,uid.duplicates_in_dataset,2,12345678901234567890,https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.3.1,"Any 'unique identifier' may not be used in mor...",uid,12345678901234567890
Error,E3000,uid.duplicates_in_dataset,3,12345678901234567890,https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.3.1,"Any 'unique identifier' may not be used in mor...",uid,12345678901234567890
"validation_type","validation_id","validation_name","row","unique_identifier","fig_link","validation_description","field_1","value_1"
"Error","E3000","uid.duplicates_in_dataset",2,"12345678901234567890","https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.3.1","Any 'unique identifier' may not be used in mor...","uid","12345678901234567890"
"Error","E3000","uid.duplicates_in_dataset",3,"12345678901234567890","https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.3.1","Any 'unique identifier' may not be used in mor...","uid","12345678901234567890"
"""
).strip('\n')
actual_output = df_to_download(self.input_df)
assert actual_output == expected_output
assert actual_output.strip() == expected_output

def test_empty_download_csv(self):
expected_output = dedent(
Expand Down
2 changes: 0 additions & 2 deletions tests/test_schema_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,6 @@ def test_with_multi_invalid_data_with_phase1(self):
# should only return phase 1 validation result since phase1 failed
assert not is_valid

assert [ValidationPhase.SYNTACTICAL.value] == findings_df["validation_phase"].unique().tolist()
assert validation_phase == ValidationPhase.SYNTACTICAL.value
assert len(findings_df) == 1

Expand All @@ -233,7 +232,6 @@ def test_with_multi_invalid_data_with_phase2(self):
# since the data passed phase 1 validations
# this should return phase 2 validations
assert not is_valid
assert [ValidationPhase.LOGICAL.value] == findings_df["validation_phase"].unique().tolist()
assert validation_phase == ValidationPhase.LOGICAL.value
assert len(findings_df.index.unique()) == 4

Expand Down

0 comments on commit 7b0aeff

Please sign in to comment.