Skip to content

Commit

Permalink
PSI fixing unit tests (#712)
Browse files: browse the repository at this point in the history
* reformat

* formatting
  • Loading branch information
taylorfturner committed Nov 10, 2022
1 parent 825db8c commit 79da96f
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 41 deletions.
12 changes: 5 additions & 7 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,12 +563,10 @@ def _preprocess_for_calculate_psi(
new_other_histogram["bin_edges"] = other_histogram["bin_edges"]
new_other_histogram["bin_counts"] = other_histogram["bin_counts"]

len_self_bin_counts = 0
if len(self_histogram["bin_counts"]) > 0:
len_self_bin_counts = len(self_histogram["bin_counts"])
len_self_bin_counts = len(self_histogram["bin_counts"])

# re-calculate `self` histogram
if not len_self_bin_counts == num_psi_bins:
if len_self_bin_counts != num_psi_bins:
histogram, hist_loss = self._regenerate_histogram(
bin_counts=self_histogram["bin_counts"],
bin_edges=self_histogram["bin_edges"],
Expand All @@ -583,9 +581,9 @@ def _preprocess_for_calculate_psi(

# re-calculate `other_profile` histogram
histogram_edges_not_equal = False
all_array_values_equal = (
other_histogram["bin_edges"] == self_histogram["bin_edges"]
).all()
all_array_values_equal = np.array_equal(
other_histogram["bin_edges"], self_histogram["bin_edges"]
)
if not all_array_values_equal:
histogram_edges_not_equal = True

Expand Down
103 changes: 103 additions & 0 deletions dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -996,3 +996,106 @@ def test_diff(self):
str(exc.exception),
"Unsupported operand type(s) for diff: 'TestColumnWProps' and" " 'str'",
)

# PSI same distribution test
other1, other2 = TestColumnWProps(), TestColumnWProps()
other1.match_count = 55
other1._stored_histogram = {
"total_loss": 0,
"current_loss": 0,
"suggested_bin_count": 10,
"histogram": {
"bin_counts": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
"bin_edges": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
},
}

other2.match_count = 550
other2._stored_histogram = {
"total_loss": 0,
"current_loss": 0,
"suggested_bin_count": 10,
"histogram": {
"bin_counts": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) * 10,
"bin_edges": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
},
}

expected_psi_value = 0
psi_value = other1._calculate_psi(
self_match_count=other1.match_count,
self_histogram=other1._stored_histogram["histogram"],
other_match_count=other2.match_count,
other_histogram=other2._stored_histogram["histogram"],
)
self.assertEquals(expected_psi_value, psi_value)

# PSI min_min_edge == max_max_edge
other1, other2 = TestColumnWProps(), TestColumnWProps()
other1.match_count = 10
other1._stored_histogram = {
"total_loss": 0,
"current_loss": 0,
"suggested_bin_count": 10,
"histogram": {"bin_counts": np.array([10]), "bin_edges": np.array([1, 1])},
}

other2.match_count = 20
other2._stored_histogram = {
"total_loss": 0,
"current_loss": 0,
"suggested_bin_count": 10,
"histogram": {"bin_counts": np.array([20]), "bin_edges": np.array([1, 1])},
}

expected_psi_value = 0
psi_value = other1._calculate_psi(
self_match_count=other1.match_count,
self_histogram=other1._stored_histogram["histogram"],
other_match_count=other2.match_count,
other_histogram=other2._stored_histogram["histogram"],
)
self.assertEquals(expected_psi_value, psi_value)

# PSI regen other / not self
other1, other2 = TestColumnWProps(), TestColumnWProps()
other1.match_count = 55
other1._stored_histogram = {
"total_loss": 0,
"current_loss": 0,
"suggested_bin_count": 10,
"histogram": {
"bin_counts": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
"bin_edges": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
},
}

other2.match_count = 20
other2._stored_histogram = {
"total_loss": 0,
"current_loss": 0,
"suggested_bin_count": 10,
"histogram": {
"bin_counts": np.array([5, 5, 10]),
"bin_edges": np.array([1, 3, 5, 7]),
},
}

expected_psi_value = 0.6617899380349177
psi_value = other1._calculate_psi(
self_match_count=other1.match_count,
self_histogram=other1._stored_histogram["histogram"],
other_match_count=other2.match_count,
other_histogram=other2._stored_histogram["histogram"],
)
self.assertEquals(expected_psi_value, psi_value)

# PSI regen self / not other
expected_psi_value = 0.6617899380349177
psi_value = other1._calculate_psi(
self_match_count=other2.match_count,
self_histogram=other2._stored_histogram["histogram"],
other_match_count=other1.match_count,
other_histogram=other1._stored_histogram["histogram"],
)
self.assertEquals(expected_psi_value, psi_value)
34 changes: 0 additions & 34 deletions dataprofiler/tests/profilers/test_text_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,37 +608,3 @@ def test_diff(self):
places=2,
)
self.assertDictEqual(expected_diff, profile_diff)

# re-create `diamond.csv` categorical column
df = pd.Series(["D", "I", "F", "H", "G"]).apply(str)

df2 = pd.Series(["D", "I", "F", "H", "G"]).apply(str)

profiler1 = TextColumn(df.name)
profiler1.update(df)
profile1 = profiler1.profile

profiler2 = TextColumn(df2.name)
profiler2.update(df2)
profile2 = profiler2.profile

expected_diff = {
"min": "unchanged",
"max": "unchanged",
"sum": "unchanged",
"mean": "unchanged",
"median": "unchanged",
"mode": "unchanged",
"median_absolute_deviation": "unchanged",
"variance": "unchanged",
"stddev": "unchanged",
"t-test": {
"t-statistic": None,
"conservative": {"df": None, "p-value": None},
"welch": {"df": None, "p-value": None},
},
"vocab": "unchanged",
}

profile_diff = profiler1.diff(profiler2)
self.assertDictEqual(expected_diff, profile_diff)

0 comments on commit 79da96f

Please sign in to comment.