
Commit

Fix scikit-learn#27839: Adjust LocalOutlierFactor for data with duplicated samples

Previously, when a value in the dataset was repeated more times than the algorithm's number of neighbors, the outliers were miscalculated.
Because the distance between duplicated samples is 0, their local reachability density equals 1e10. As a result, samples close to the duplicated value receive a very low negative outlier factor (below -1e7) and are labeled as outliers.
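
As a rough illustration, here is a minimal sketch of the failure mode (the data values are assumptions adapted from the linked issue, not part of this commit):

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

# One value repeated more often than n_neighbors, nearby inliers, one clear outlier
X = np.r_[[0.1] * 10, np.linspace(0.1, 0.3, 30), [100.0]].reshape(-1, 1)

lof = LocalOutlierFactor(n_neighbors=5, contamination=0.1)
labels = lof.fit_predict(X)

# The samples right next to the repeated 0.1 get scores below -1e7 and are
# labeled -1 (outlier) even though they are ordinary inliers
print(lof.negative_outlier_factor_.min())  # far below -1e7
print(X[labels == -1].ravel())  # includes values just above 0.1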
This fix checks whether the minimum negative outlier factor is below -1e7 and, if so, raises the number of neighbors to the number of occurrences of the most frequent value + 1 and emits a warning.
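
The adjustment amounts to the following manual workaround (a sketch reusing the assumed toy data from above, not the commit's code):

import numpy as np
from sklearn.neighbors import LocalOutlierFactor

X = np.r_[[0.1] * 10, np.linspace(0.1, 0.3, 30), [100.0]].reshape(-1, 1)

# Number of occurrences of the most frequent value
_, counts = np.unique(X, return_counts=True)
max_duplicates = int(counts.max())

# Refit with n_neighbors raised past the duplicate count
lof = LocalOutlierFactor(n_neighbors=max_duplicates + 1, contamination=0.1)
labels = lof.fit_predict(X)

# The minimum score is now far above -1e7, so the duplicated samples no longer
# dominate the ranking of outliers
print(lof.negative_outlier_factor_.min())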
Notes: Added a handle_duplicates variable that lets developers handle the duplicate values manually if desired. Also added a memory_limit variable, which developers can adjust, to avoid memory errors on very large datasets.
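
For scale, the memory guard behind memory_limit works out as follows for the test data added below (counts are approximate and assumed from that test):

# About 1000 copies of 0.1 among 4500 samples in total
max_duplicates = 1001  # 1000 explicit copies plus the start of the linspace
n_samples = 4500
kneighbors_array_memory = (max_duplicates + 1) * n_samples * 8  # bytes
print(kneighbors_array_memory)  # ~3.6e7 bytes, well below the 1e8-byte default limit

Since this stays under the limit, the automatic n_neighbors increase is allowed to trigger a refit there.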
HenriqueProj committed Apr 3, 2024
1 parent 30f4d9d commit 69f62fb
Showing 2 changed files with 75 additions and 0 deletions.
42 changes: 42 additions & 0 deletions sklearn/neighbors/_lof.py
@@ -317,6 +317,48 @@ def fit(self, X, y=None):
                self.negative_outlier_factor_, 100.0 * self.contamination
            )

        # DEV: Set to False to handle the duplicate values manually
        handle_duplicates = True

        # Check that the negative_outlier_factor_ values are within an
        # acceptable range; novelty must also be False to detect outliers
        if (
            np.min(self.negative_outlier_factor_) < -1e7
            and handle_duplicates
            and not self.novelty
        ):
            # Raises an error in the case of a 1D or sparse array
            check_array(X, accept_sparse=False)

            # Get the number of occurrences of the most frequent element
            unique_elements, duplicates = np.unique(X, return_counts=True)
            max_duplicates = duplicates.max()

            # DEV: Memory usage limit; can be raised at the risk of MemoryErrors
            memory_limit = 1e8  # Bytes

            # Memory needed to recompute kneighbors with the new n_neighbors
            kneighbors_array_memory = (max_duplicates + 1) * n_samples * 8  # Bytes

            # Only adjust when the number of duplicates reaches n_neighbors, and
            # cap memory usage, as the refit can fail for really large datasets
            if (
                max_duplicates >= self.n_neighbors
                and kneighbors_array_memory < memory_limit
            ):
                self.n_neighbors = max_duplicates + 1

                warnings.warn(
                    "Duplicate values are leading to incorrect results. "
                    "Increasing number of neighbors."
                )
                # Refit with the new number of neighbors
                return self.fit(X)

        return self

def _check_novelty_predict(self):
33 changes: 33 additions & 0 deletions sklearn/neighbors/tests/test_lof.py
@@ -359,3 +359,36 @@ def test_lof_dtype_equivalence(algorithm, novelty, contamination):
    y_pred_32 = getattr(lof_32, method)(X_32)
    y_pred_64 = getattr(lof_64, method)(X_64)
    assert_allclose(y_pred_32, y_pred_64, atol=0.0002)


def test_lof_duplicate_samples():
    """
    Check that outliers are detected correctly when the data contains duplicated values.

    Non-regression test for
    https://github.com/scikit-learn/scikit-learn/issues/27839
    """

    rng = np.random.default_rng(0)

    # 100 times the number of samples of the example shown in the issue
    x = rng.permutation(
        np.hstack(
            [
                [0.1] * 1000,  # constant values
                np.linspace(0.1, 0.3, num=3000),
                rng.random(500) * 100,  # the clear outliers
            ]
        )
    )
    X = x.reshape(-1, 1)

    lof = neighbors.LocalOutlierFactor(n_neighbors=5, contamination=0.1)
    outliers = lof.fit_predict(X)

    indices = np.where(outliers == -1)
    outliers = X[indices]

    # Check that only values outside of the [0.1, 0.3] range are flagged as outliers
    for outlier in outliers:
        assert outlier < 0.1 or outlier > 0.3
