PSI fixing unit tests (#712)

* reformat
* formatting
capitalone · Nov 10, 2022 · 79da96f
1 parent 825db8c
commit 79da96f
Show file tree

Hide file tree

Showing 3 changed files with 108 additions and 41 deletions.
diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py
@@ -563,12 +563,10 @@ def _preprocess_for_calculate_psi(
             new_other_histogram["bin_edges"] = other_histogram["bin_edges"]
             new_other_histogram["bin_counts"] = other_histogram["bin_counts"]
 
-            len_self_bin_counts = 0
-            if len(self_histogram["bin_counts"]) > 0:
-                len_self_bin_counts = len(self_histogram["bin_counts"])
+            len_self_bin_counts = len(self_histogram["bin_counts"])
 
             # re-calculate `self` histogram
-            if not len_self_bin_counts == num_psi_bins:
+            if len_self_bin_counts != num_psi_bins:
                 histogram, hist_loss = self._regenerate_histogram(
                     bin_counts=self_histogram["bin_counts"],
                     bin_edges=self_histogram["bin_edges"],
@@ -583,9 +581,9 @@ def _preprocess_for_calculate_psi(
 
             # re-calculate `other_profile` histogram
             histogram_edges_not_equal = False
-            all_array_values_equal = (
-                other_histogram["bin_edges"] == self_histogram["bin_edges"]
-            ).all()
+            all_array_values_equal = np.array_equal(
+                other_histogram["bin_edges"], self_histogram["bin_edges"]
+            )
             if not all_array_values_equal:
                 histogram_edges_not_equal = True
 

diff --git a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py
@@ -996,3 +996,106 @@ def test_diff(self):
             str(exc.exception),
             "Unsupported operand type(s) for diff: 'TestColumnWProps' and" " 'str'",
         )
+
+        # PSI same distribution test
+        other1, other2 = TestColumnWProps(), TestColumnWProps()
+        other1.match_count = 55
+        other1._stored_histogram = {
+            "total_loss": 0,
+            "current_loss": 0,
+            "suggested_bin_count": 10,
+            "histogram": {
+                "bin_counts": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
+                "bin_edges": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
+            },
+        }
+
+        other2.match_count = 550
+        other2._stored_histogram = {
+            "total_loss": 0,
+            "current_loss": 0,
+            "suggested_bin_count": 10,
+            "histogram": {
+                "bin_counts": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) * 10,
+                "bin_edges": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
+            },
+        }
+
+        expected_psi_value = 0
+        psi_value = other1._calculate_psi(
+            self_match_count=other1.match_count,
+            self_histogram=other1._stored_histogram["histogram"],
+            other_match_count=other2.match_count,
+            other_histogram=other2._stored_histogram["histogram"],
+        )
+        self.assertEquals(expected_psi_value, psi_value)
+
+        # PSI min_min_edge == max_max_edge
+        other1, other2 = TestColumnWProps(), TestColumnWProps()
+        other1.match_count = 10
+        other1._stored_histogram = {
+            "total_loss": 0,
+            "current_loss": 0,
+            "suggested_bin_count": 10,
+            "histogram": {"bin_counts": np.array([10]), "bin_edges": np.array([1, 1])},
+        }
+
+        other2.match_count = 20
+        other2._stored_histogram = {
+            "total_loss": 0,
+            "current_loss": 0,
+            "suggested_bin_count": 10,
+            "histogram": {"bin_counts": np.array([20]), "bin_edges": np.array([1, 1])},
+        }
+
+        expected_psi_value = 0
+        psi_value = other1._calculate_psi(
+            self_match_count=other1.match_count,
+            self_histogram=other1._stored_histogram["histogram"],
+            other_match_count=other2.match_count,
+            other_histogram=other2._stored_histogram["histogram"],
+        )
+        self.assertEquals(expected_psi_value, psi_value)
+
+        # PSI regen other / not self
+        other1, other2 = TestColumnWProps(), TestColumnWProps()
+        other1.match_count = 55
+        other1._stored_histogram = {
+            "total_loss": 0,
+            "current_loss": 0,
+            "suggested_bin_count": 10,
+            "histogram": {
+                "bin_counts": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
+                "bin_edges": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
+            },
+        }
+
+        other2.match_count = 20
+        other2._stored_histogram = {
+            "total_loss": 0,
+            "current_loss": 0,
+            "suggested_bin_count": 10,
+            "histogram": {
+                "bin_counts": np.array([5, 5, 10]),
+                "bin_edges": np.array([1, 3, 5, 7]),
+            },
+        }
+
+        expected_psi_value = 0.6617899380349177
+        psi_value = other1._calculate_psi(
+            self_match_count=other1.match_count,
+            self_histogram=other1._stored_histogram["histogram"],
+            other_match_count=other2.match_count,
+            other_histogram=other2._stored_histogram["histogram"],
+        )
+        self.assertEquals(expected_psi_value, psi_value)
+
+        # PSI regen self / not other
+        expected_psi_value = 0.6617899380349177
+        psi_value = other1._calculate_psi(
+            self_match_count=other2.match_count,
+            self_histogram=other2._stored_histogram["histogram"],
+            other_match_count=other1.match_count,
+            other_histogram=other1._stored_histogram["histogram"],
+        )
+        self.assertEquals(expected_psi_value, psi_value)
diff --git a/dataprofiler/tests/profilers/test_text_column_profile.py b/dataprofiler/tests/profilers/test_text_column_profile.py
@@ -608,37 +608,3 @@ def test_diff(self):
             places=2,
         )
         self.assertDictEqual(expected_diff, profile_diff)
-
-        # re-create `diamond.csv` categorical column
-        df = pd.Series(["D", "I", "F", "H", "G"]).apply(str)
-
-        df2 = pd.Series(["D", "I", "F", "H", "G"]).apply(str)
-
-        profiler1 = TextColumn(df.name)
-        profiler1.update(df)
-        profile1 = profiler1.profile
-
-        profiler2 = TextColumn(df2.name)
-        profiler2.update(df2)
-        profile2 = profiler2.profile
-
-        expected_diff = {
-            "min": "unchanged",
-            "max": "unchanged",
-            "sum": "unchanged",
-            "mean": "unchanged",
-            "median": "unchanged",
-            "mode": "unchanged",
-            "median_absolute_deviation": "unchanged",
-            "variance": "unchanged",
-            "stddev": "unchanged",
-            "t-test": {
-                "t-statistic": None,
-                "conservative": {"df": None, "p-value": None},
-                "welch": {"df": None, "p-value": None},
-            },
-            "vocab": "unchanged",
-        }
-
-        profile_diff = profiler1.diff(profiler2)
-        self.assertDictEqual(expected_diff, profile_diff)