Skip to content

Commit

Permalink
Changed to use pivot tables in the download formatter (#167)
Browse files Browse the repository at this point in the history
Closes #166 

Changed df_to_download to use pivot tables to generate the CSV. This
greatly reduces the processing time of this function to just a few
seconds for very large files, and the time stays roughly constant as
file size increases.

Also used QUOTE_NONNUMERIC instead of specifically escaping just the
description column.
  • Loading branch information
jcadam14 committed Apr 30, 2024
1 parent defa9c5 commit 7b0aeff
Show file tree
Hide file tree
Showing 4 changed files with 69 additions and 45 deletions.
3 changes: 0 additions & 3 deletions src/regtech_data_validator/create_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,10 +185,7 @@ def validate_phases(df: pd.DataFrame, context: dict[str, str] | None = None) ->
p1_is_valid, p1_findings = validate(get_phase_1_schema_for_lei(context), df)

if not p1_is_valid:
p1_findings.insert(1, "validation_phase", ValidationPhase.SYNTACTICAL.value, True)
return p1_is_valid, p1_findings, ValidationPhase.SYNTACTICAL.value

p2_is_valid, p2_findings = validate(get_phase_2_schema_for_lei(context), df)
if not p2_is_valid:
p2_findings.insert(1, "validation_phase", ValidationPhase.LOGICAL.value, True)
return p2_is_valid, p2_findings, ValidationPhase.LOGICAL.value
101 changes: 65 additions & 36 deletions src/regtech_data_validator/data_formatters.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import csv
import json
import pandas as pd

Expand All @@ -7,37 +8,65 @@


def df_to_download(df: pd.DataFrame) -> str:
highest_field_count = 0
full_csv = []
if not df.empty:

for _, group_df in df.groupby(['validation_id', 'record_no']):
v_head = group_df.iloc[0]
row_data = [
v_head['validation_severity'],
v_head['validation_id'],
v_head['validation_name'],
str(v_head['record_no'] + 1),
v_head['uid'],
v_head['fig_link'],
f"\"{v_head['validation_desc']}\"",
]
current_count = 0
fields = group_df.iterrows() if v_head['validation_id'] in more_than_2_fields else group_df[::-1].iterrows()
for _, field_data in fields:
row_data.extend([field_data['field_name'], field_data['field_value']])
current_count += 1

full_csv.append(",".join(row_data))
highest_field_count = current_count if current_count > highest_field_count else highest_field_count

field_headers = []
for i in range(highest_field_count):
field_headers.append(f"field_{i+1}")
field_headers.append(f"value_{i+1}")
full_csv.insert(
0,
",".join(
if df.empty:
        # return headers of csv for 'empty' report
return "validation_type,validation_id,validation_name,row,unique_identifier,fig_link,validation_description"
else:
df.reset_index(drop=True, inplace=True)
df = df.drop(["scope"], axis=1)

df['field_number'] = (
df.groupby(
[
"validation_severity",
"validation_id",
"validation_name",
"record_no",
"uid",
"fig_link",
"validation_desc",
]
).cumcount()
+ 1
)
df_pivot = df.pivot_table(
index=[
"validation_severity",
"validation_id",
"validation_name",
"record_no",
"uid",
"fig_link",
"validation_desc",
],
columns="field_number",
values=["field_name", "field_value"],
aggfunc="first",
).reset_index()

df_pivot.columns = [f"{col[0]}_{col[1]}" if col[1] else col[0] for col in df_pivot.columns]

df_pivot.rename(
columns={f"field_name_{i}": f"field_{i}" for i in range(1, len(df_pivot.columns) // 2 + 1)}, inplace=True
)
df_pivot.rename(
columns={f"field_value_{i}": f"value_{i}" for i in range(1, len(df_pivot.columns) // 2 + 1)}, inplace=True
)
df_pivot.rename(
columns={
"record_no": "row",
"validation_severity": "validation_type",
"uid": "unique_identifier",
"validation_desc": "validation_description",
},
inplace=True,
)

field_columns = [col for col in df_pivot.columns if col.startswith('field_')]
value_columns = [col for col in df_pivot.columns if col.startswith('value_')]
sorted_columns = [col for pair in zip(field_columns, value_columns) for col in pair]

df_pivot = df_pivot[
[
"validation_type",
"validation_id",
Expand All @@ -47,12 +76,12 @@ def df_to_download(df: pd.DataFrame) -> str:
"fig_link",
"validation_description",
]
+ field_headers
),
)
csv_string = "\n".join(full_csv)
+ sorted_columns
]

df_pivot['row'] += 1

return csv_string
return df_pivot.to_csv(index=False, quoting=csv.QUOTE_NONNUMERIC)


def df_to_str(df: pd.DataFrame) -> str:
Expand Down
8 changes: 4 additions & 4 deletions tests/test_output_formats.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,13 +132,13 @@ def test_output_json(self):
def test_download_csv(self):
expected_output = dedent(
"""
validation_type,validation_id,validation_name,row,unique_identifier,fig_link,validation_description,field_1,value_1
Error,E3000,uid.duplicates_in_dataset,2,12345678901234567890,https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.3.1,"Any 'unique identifier' may not be used in mor...",uid,12345678901234567890
Error,E3000,uid.duplicates_in_dataset,3,12345678901234567890,https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.3.1,"Any 'unique identifier' may not be used in mor...",uid,12345678901234567890
"validation_type","validation_id","validation_name","row","unique_identifier","fig_link","validation_description","field_1","value_1"
"Error","E3000","uid.duplicates_in_dataset",2,"12345678901234567890","https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.3.1","Any 'unique identifier' may not be used in mor...","uid","12345678901234567890"
"Error","E3000","uid.duplicates_in_dataset",3,"12345678901234567890","https://www.consumerfinance.gov/data-research/small-business-lending/filing-instructions-guide/2024-guide/#4.3.1","Any 'unique identifier' may not be used in mor...","uid","12345678901234567890"
"""
).strip('\n')
actual_output = df_to_download(self.input_df)
assert actual_output == expected_output
assert actual_output.strip() == expected_output

def test_empty_download_csv(self):
expected_output = dedent(
Expand Down
2 changes: 0 additions & 2 deletions tests/test_schema_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,6 @@ def test_with_multi_invalid_data_with_phase1(self):
# should only return phase 1 validation result since phase1 failed
assert not is_valid

assert [ValidationPhase.SYNTACTICAL.value] == findings_df["validation_phase"].unique().tolist()
assert validation_phase == ValidationPhase.SYNTACTICAL.value
assert len(findings_df) == 1

Expand All @@ -233,7 +232,6 @@ def test_with_multi_invalid_data_with_phase2(self):
# since the data passed phase 1 validations
# this should return phase 2 validations
assert not is_valid
assert [ValidationPhase.LOGICAL.value] == findings_df["validation_phase"].unique().tolist()
assert validation_phase == ValidationPhase.LOGICAL.value
assert len(findings_df.index.unique()) == 4

Expand Down

0 comments on commit 7b0aeff

Please sign in to comment.